diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d00377f554cfe2800301419cb1d888f10f800ab
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcbe6ac8e5588bd2dbf6f2fa10826e9efd617af80fa5495358165d8dccfa19c9
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..314b3ea31b149d73d691efd6f496b97f6a84e1d7
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51942e1cdd4fc4bb9aa2a9649a26fdd2e3822e4eb924e4e55084b8e09a90ea29
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..26d42273836e83896673b1fe4cd0c1c6fd644004
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10f9cf84a8ccd5c8c1dfa19876aab016341e6c5c44a461108eb4423f345204ec
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e913b448dd37999501c9e98ec60f8d9d1ca1241
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d37ea8ab9cda46cc28c0964d75caf944b1305770a4bc789c738b99991c8672b8
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..eeb2b771457f443b3d8654d4cb077da3e5cded14
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ed39d3860a4c5b79b338c9f98875ba542b909b8bac0991be734db1360554e02
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..83a3e825c1453e522c81944f203440d027874e82
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a26da94d90384063a4c8c4d7f9f630cd714d7eb07f25fce0a58ea54182cbb9b8
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8073af126d00c3679383f16a39784114786f2372
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56b9b1f946e7d9a93adaab132ff2381cef31be146bae4ac5763f3249d98fa378
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d00377f554cfe2800301419cb1d888f10f800ab
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcbe6ac8e5588bd2dbf6f2fa10826e9efd617af80fa5495358165d8dccfa19c9
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..808c3563296dc0a7be82da6869671c500c55404f
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db07aed2b90e65be9083d5431d37272028e06d4b92d6ba3067f059976e7ff591
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..68ac231f9774f7a4a6254684400eb280cd69e9fa
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ac90e9e7304e3e220667308999a777273eaea82ea51cb28003c1fa9e40738a9
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6abb734a07c980ed975f724eb9dad68f2e62731a
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt
@@ -0,0 +1,3 @@
+{"train_lr": 4.2672042852177435e-05, "train_grad_norm": 0.8576727652187944, "train_closs": 1.0961337126687645, "epoch": 0, "val_lr": 4.2672042852177435e-05, "val_grad_norm": 0.8576727652187944, "val_closs": 1.0961337126687645}
+{"train_lr": 2.989280819774688e-05, "train_grad_norm": 0.7565592593381042, "train_closs": 1.0483260756908241, "epoch": 1, "val_lr": 2.989280819774688e-05, "val_grad_norm": 0.7565592593381042, "val_closs": 1.0483260756908241}
+{"train_lr": 9.43437279837357e-06, "train_grad_norm": 0.7707539895124279, "train_closs": 1.0381248756278665, "epoch": 2, "val_lr": 9.43437279837357e-06, "val_grad_norm": 0.7707539895124279, "val_closs": 1.0381248756278665}
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..3e8df40b9236cc509af3a1865e95527f2edb136a
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log
@@ -0,0 +1,2801 @@
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 6): env://, gpu 6
+| distributed init (rank 3): env://, gpu 3
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 7): env://, gpu 7
+| distributed init (rank 5): env://, gpu 5
+| distributed init (rank 2): env://, gpu 2
+| distributed init (rank 0): env://, gpu 0
+| distributed init (rank 4): env://, gpu 4
+[00:02:04.046928] > initializing model parallel with size 1
+[00:02:04.047015] > initializing ddp with size 8
+[00:02:04.047022] > initializing pipeline with size 1
+[00:02:04.226045] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory
+[00:02:04.226122] Namespace(batch_size=16,
+accum_iter=1,
+llama_type='llama_qformerv2_peft',
+llama_config=['../checkpoints/llama2/Llama-2-13b/params.json',
+'configs/model/finetune/sg/llamaPeft_normBiasLora.json'],
+no_visual=False,
+tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model',
+pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/',
+pretrained_type='consolidated',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=3,
+warmup_epochs=0.2,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/mm/alpaca_llava.yaml',
+output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B',
+log_dir='./output_dir',
+save_interval=1,
+only_save_trainable=True,
+device='cuda',
+seed=0,
+resume='',
+num_workers=16,
+pin_mem=True,
+world_size=8,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[00:02:04.226978] Start initialization.
+[00:02:04.227022] ## Processing on RANK 0.
+[00:02:04.237574] Model Args:
+ ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True)
+[00:03:36.399161] build llama model with qformerv2
+[00:03:36.779030] (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /Salesforce/blip2-opt-2.7b/resolve/main/config.json (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))"), '(Request ID: f000589d-f862-41f8-832e-73fc0c96ee6a)')
+ Loading checkpoint shards: 0%| | 0/2 [00:00
+[00:36:06.281575] Start training for 3 epochs
+[00:36:06.296296] log_dir: ./output_dir
+[00:36:22.985451] Epoch: [0]  [0/3229]  lr: 0.000000  grad_norm: 2.3647 (2.3647)  closs: 1.5947 (1.5947)  time: 16.6883  data: 8.5180  max mem: 36209
+[00:37:03.163928] Epoch: [0]  [10/3229]  lr: 0.000001  grad_norm: 2.2614 (2.1844)  closs: 1.3985 (1.3614)  time: 5.1696  data: 0.7746  max mem: 54683
+[00:37:43.938889] Epoch: [0]  [20/3229]  lr: 0.000002  grad_norm: 2.2614 (2.2117)  closs: 1.4480 (1.4415)  time: 4.0476  data: 0.0002  max mem: 54683
+[00:38:24.104529] Epoch: [0]  [30/3229]  lr: 0.000002  grad_norm: 2.2231 (2.2052)  closs: 1.4753 (1.4253)  time: 4.0470  data: 0.0002  max mem: 54683
+[00:39:05.082555] Epoch: [0]  [40/3229]  lr: 0.000003  grad_norm: 2.1752 (2.1914)  closs: 1.4110 (1.4059)  time: 4.0571  data: 0.0002  max mem: 54683
+[00:39:46.229231] Epoch: [0]  [50/3229]  lr: 0.000004  grad_norm: 2.0509 (2.2594)  closs: 1.4161 (1.4234)  time: 4.1061  data: 0.0002  max mem: 54683
+[00:40:26.733155] Epoch: [0]  [60/3229]  lr: 0.000005  grad_norm: 1.9159 (2.1852)  closs: 1.4598 (1.4133)  time: 4.0824  data: 0.0003  max mem: 54683
+[00:41:07.256178] Epoch: [0]  [70/3229]  lr: 0.000005  grad_norm: 1.7466 (2.1041)  closs: 1.4069 (1.4059)  time: 4.0513  data: 0.0003  max mem: 54683
+[00:41:48.461794] Epoch: [0]  [80/3229]  lr: 0.000006  grad_norm: 1.4995 (2.0192)  closs: 1.3245 (1.3975)  time: 4.0864  data: 0.0002  max mem: 54683
+[00:42:28.626897] Epoch: [0]  [90/3229]  lr: 0.000007  grad_norm: 1.2119 (1.9324)  closs: 1.2737 (1.3830)  time: 4.0684  data: 0.0002  max mem: 54683
+[00:43:09.428679] Epoch: [0]  [100/3229]  lr: 0.000008  grad_norm: 1.1636 (1.8590)  closs: 1.2619 (1.3700)  time: 4.0482  data: 0.0003  max mem: 54683
+[00:43:49.914083] Epoch: [0]  [110/3229]  lr: 0.000009  grad_norm: 1.0306 (1.7801)  closs: 1.2915 (1.3614)  time: 4.0643  data: 0.0003  max mem: 54683
+[00:44:31.063164] Epoch: [0]  [120/3229]  lr: 0.000009  grad_norm: 0.9462 (1.7095)  closs: 1.3081 (1.3546)  time: 4.0816  data: 0.0002  max mem: 54683
+[00:45:11.261841] Epoch: [0]  [130/3229]  lr: 0.000010  grad_norm: 0.9337 (1.6472)  closs: 1.2631 (1.3397)  time: 4.0673  data: 0.0002  max mem: 54683
+[00:45:52.112790] Epoch: [0]  [140/3229]  lr: 0.000011  grad_norm: 0.8573 (1.5931)  closs: 1.2118 (1.3288)  time: 4.0524  data: 0.0002  max mem: 54683
+[00:46:32.944599] Epoch: [0]  [150/3229]  lr: 0.000012  grad_norm: 0.8517 (1.5447)  closs: 1.1676 (1.3182)  time: 4.0841  data: 0.0003  max mem: 54683
+[00:47:14.287016] Epoch: [0]  [160/3229]  lr: 0.000012  grad_norm: 0.8240 (1.4999)  closs: 1.1756 (1.3118)  time: 4.1086  data: 0.0003  max mem: 54683
+[00:47:55.388906] Epoch: [0]  [170/3229]  lr: 0.000013  grad_norm: 0.8136 (1.4618)  closs: 1.1849 (1.3054)  time: 4.1221  data: 0.0003  max mem: 54683
+[00:48:36.512000] Epoch: [0]  [180/3229]  lr: 0.000014  grad_norm: 0.8954 (1.4305)  closs: 1.2078 (1.3010)  time: 4.1112  data: 0.0002  max mem: 54683
+[00:49:17.622250] Epoch: [0]  [190/3229]  lr: 0.000015  grad_norm: 0.8401 (1.3994)  closs: 1.2296 (1.2966)  time: 4.1116  data: 0.0002  max mem: 54683
+[00:49:58.917542] Epoch: [0]  [200/3229]  lr: 0.000015  grad_norm: 0.8401 (1.3746)  closs: 1.2198 (1.2906)  time: 4.1202  data: 0.0002  max mem: 54683
+[00:50:39.721214] Epoch: [0]  [210/3229]  lr: 0.000016  grad_norm: 0.8446 (1.3499)  closs: 1.1936 (1.2863)  time: 4.1049  data: 0.0002  max mem: 54684
+[00:51:20.209941] Epoch: [0]  [220/3229]  lr: 0.000017  grad_norm: 0.8360 (1.3275)  closs: 1.2039 (1.2828)  time: 4.0645  data: 0.0002  max mem: 54684
+[00:52:01.340571] Epoch: [0]  [230/3229]  lr: 0.000018  grad_norm: 0.8406 (1.3077)  closs: 1.2159 (1.2796)  time: 4.0809  data: 0.0002  max mem: 54684
+[00:52:42.293014] Epoch: [0]  [240/3229]  lr: 0.000019  grad_norm: 0.8406 (1.2881)  closs: 1.2119 (1.2756)  time: 4.1041  data: 0.0002  max mem: 54684
+[00:53:22.448171] Epoch: [0]  [250/3229]  lr: 0.000019  grad_norm: 0.8104 (1.2702)  closs: 1.1467 (1.2704)  time: 4.0553  data: 0.0002  max mem: 54684
+[00:54:02.261862] Epoch: [0]  [260/3229]  lr: 0.000020  grad_norm: 0.8084 (1.2538)  closs: 1.1398 (1.2651)  time: 3.9984  data: 0.0002  max mem: 54684
+[00:54:42.733423] Epoch: [0]  [270/3229]  lr: 0.000021  grad_norm: 0.8337 (1.2407)  closs: 1.1322 (1.2590)  time: 4.0142  data: 0.0002  max mem: 54684
+[00:55:23.786823] Epoch: [0]  [280/3229]  lr: 0.000022  grad_norm: 0.8337 (1.2265)  closs: 1.1322 (1.2549)  time: 4.0762  data: 0.0002  max mem: 54684
+[00:56:04.261686] Epoch: [0]  [290/3229]  lr: 0.000022  grad_norm: 0.8146 (1.2134)  closs: 1.1835 (1.2526)  time: 4.0763  data: 0.0002  max mem: 54684
+[00:56:45.065966] Epoch: [0]  [300/3229]  lr: 0.000023  grad_norm: 0.8291 (1.2034)  closs: 1.2046 (1.2503)  time: 4.0639  data: 0.0002  max mem: 54684
+[00:57:25.203092] Epoch: [0]  [310/3229]  lr: 0.000024  grad_norm: 0.8414 (1.1916)  closs: 1.1724 (1.2467)  time: 4.0470  data: 0.0002  max mem: 54684
+[00:58:05.237988] Epoch: [0]  [320/3229]  lr: 0.000025  grad_norm: 0.8399 (1.1806)  closs: 1.1496 (1.2420)  time: 4.0085  data: 0.0002  max mem: 54684
+[00:58:46.046197] Epoch: [0]  [330/3229]  lr: 0.000026  grad_norm: 0.8551 (1.1711)  closs: 1.1496 (1.2392)  time: 4.0421  data: 0.0002  max mem: 54684
+[00:59:27.204738] Epoch: [0]  [340/3229]  lr: 0.000026  grad_norm: 0.8768 (1.1625)  closs: 1.1481 (1.2362)  time: 4.0983  data: 0.0002  max mem: 54684
+[01:00:08.357924] Epoch: [0]  [350/3229]  lr: 0.000027  grad_norm: 0.8572 (1.1541)  closs: 1.1341 (1.2344)  time: 4.1155  data: 0.0002  max mem: 54684
+[01:00:49.082581] Epoch: [0]  [360/3229]  lr: 0.000028  grad_norm: 0.8620 (1.1473)  closs: 1.1321 (1.2311)  time: 4.0938  data: 0.0002  max mem: 54684
+[01:01:29.908089] Epoch: [0]  [370/3229]  lr: 0.000029  grad_norm: 0.9078 (1.1413)  closs: 1.1322 (1.2292)  time: 4.0774  data: 0.0002  max mem: 54684
+[01:02:10.408877] Epoch: [0]  [380/3229]  lr: 0.000029  grad_norm: 0.9115 (1.1385)  closs: 1.1444 (1.2265)  time: 4.0662  data: 0.0002  max mem: 54684
+[01:02:50.910726] Epoch: [0]  [390/3229]  lr: 0.000030  grad_norm: 0.8868 (1.1318)  closs: 1.1205 (1.2241)  time: 4.0501  data: 0.0002  max mem: 54684
+[01:03:32.368355] Epoch: [0]  [400/3229]  lr: 0.000031  grad_norm: 0.8768 (1.1270)  closs: 1.1205 (1.2212)  time: 4.0979  data: 0.0002  max mem: 54684
+[01:04:12.522689] Epoch: [0]  [410/3229]  lr: 0.000032  grad_norm: 0.8617 (1.1200)  closs: 1.1138 (1.2181)  time: 4.0805  data: 0.0002  max mem: 54684
+[01:04:53.670656] Epoch: [0]  [420/3229]  lr: 0.000033  grad_norm: 0.8747 (1.1175)  closs: 1.0839 (1.2160)  time: 4.0650  data: 0.0002  max mem: 54684
+[01:05:34.517336] Epoch: [0]  [430/3229]  lr: 0.000033  grad_norm: 0.9140 (1.1135)  closs: 1.1140 (1.2136)  time: 4.0997  data: 0.0002  max mem: 54684
+[01:06:15.357490] Epoch: [0]  [440/3229]  lr: 0.000034  grad_norm: 0.9000 (1.1083)  closs: 1.1255 (1.2112)  time: 4.0843  data: 0.0002  max mem: 54684
+[01:06:56.510385] Epoch: [0]  [450/3229]  lr: 0.000035  grad_norm: 0.9130 (1.1047)  closs: 1.1443 (1.2102)  time: 4.0996  data: 0.0002  max mem: 54684
+[01:07:37.338507] Epoch: [0]  [460/3229]  lr: 0.000036  grad_norm: 0.8889 (1.0996)  closs: 1.1569 (1.2089)  time: 4.0990  data: 0.0002  max mem: 54684
+[01:08:17.507539] Epoch: [0]  [470/3229]  lr: 0.000036  grad_norm: 0.8632 (1.0962)  closs: 1.1353 (1.2070)  time: 4.0498  data: 0.0002  max mem: 54684
+[01:08:58.904572] Epoch: [0]  [480/3229]  lr: 0.000037  grad_norm: 0.9061 (1.0920)  closs: 1.0907 (1.2045)  time: 4.0782  data: 0.0002  max mem: 54684
+[01:09:39.746010] Epoch: [0]  [490/3229]  lr: 0.000038  grad_norm: 0.9314 (1.0890)  closs: 1.0907 (1.2028)  time: 4.1118  data: 0.0002  max mem: 54684
+[01:10:20.231136] Epoch: [0]  [500/3229]  lr: 0.000039  grad_norm: 0.9235 (1.0854)  closs: 1.1249 (1.2011)  time: 4.0663  data: 0.0002  max mem: 54684
+[01:11:01.054523] Epoch: [0]  [510/3229]  lr: 0.000039  grad_norm: 0.9309 (1.0834)  closs: 1.1249 (1.1995)  time: 4.0654  data: 0.0002  max mem: 54684
+[01:11:42.308142] Epoch: [0]  [520/3229]  lr: 0.000040  grad_norm: 0.9541 (1.0806)  closs: 1.1226 (1.1972)  time: 4.1038  data: 0.0002  max mem: 54684
+[01:12:23.110317] Epoch: [0]  [530/3229]  lr: 0.000041  grad_norm: 0.9516 (1.0777)  closs: 1.1269 (1.1965)  time: 4.1027  data: 0.0002  max mem: 54684
+[01:13:03.936397] Epoch: [0]  [540/3229]  lr: 0.000042  grad_norm: 0.9204 (1.0814)  closs: 1.1651 (1.1959)  time: 4.0813  data: 0.0002  max mem: 54684
+[01:13:44.208566] Epoch: [0]  [550/3229]  lr: 0.000043  grad_norm: 0.9204 (1.0785)  closs: 1.1192 (1.1931)  time: 4.0548  data: 0.0002  max mem: 54684
+[01:14:25.497954] Epoch: [0]  [560/3229]  lr: 0.000043  grad_norm: 0.9295 (1.0758)  closs: 1.0839 (1.1919)  time: 4.0780  data: 0.0002  max mem: 54684
+[01:15:05.977736] Epoch: [0]  [570/3229]  lr: 0.000044  grad_norm: 0.9723 (1.0751)  closs: 1.1325 (1.1915)  time: 4.0884  data: 0.0002  max mem: 54684
+[01:15:47.138810] Epoch: [0]  [580/3229]  lr: 0.000045  grad_norm: 0.9485 (1.0728)  closs: 1.1764 (1.1910)  time: 4.0820  data: 0.0002  max mem: 54684
+[01:16:28.229266] Epoch: [0]  [590/3229]  lr: 0.000046  grad_norm: 0.9286 (1.0704)  closs: 1.1431 (1.1902)  time: 4.1125  data: 0.0002  max mem: 54684
+[01:17:08.494066] Epoch: [0]  [600/3229]  lr: 0.000046  grad_norm: 0.9124 (1.0680)  closs: 1.1127 (1.1888)  time: 4.0677  data: 0.0002  max mem: 54684
+[01:17:49.333146] Epoch: [0]  [610/3229]  lr: 0.000047  grad_norm: 0.8772 (1.0649)  closs: 1.1387 (1.1877)  time: 4.0551  data: 0.0002  max mem: 54684
+[01:18:30.513096] Epoch: [0]  [620/3229]  lr: 0.000048  grad_norm: 0.8946 (1.0631)  closs: 1.1490 (1.1873)  time: 4.1009  data: 0.0002  max mem: 54684
+[01:19:12.132382] Epoch: [0]  [630/3229]  lr: 0.000049  grad_norm: 0.8910 (1.0597)  closs: 1.1475 (1.1862)  time: 4.1399  data: 0.0002  max mem: 54684
+[01:19:52.876937] Epoch: [0]  [640/3229]  lr: 0.000050  grad_norm: 0.8884 (1.0579)  closs: 1.1173 (1.1843)  time: 4.1181  data: 0.0002  max mem: 54684
+[01:20:33.485645] Epoch: [0]  [650/3229]  lr: 0.000050  grad_norm: 0.8884 (1.0558)  closs: 1.0704 (1.1821)  time: 4.0676  data: 0.0002  max mem: 54684
+[01:21:14.337520] Epoch: [0]  [660/3229]  lr: 0.000050  grad_norm: 0.8974 (1.0531)  closs: 1.0758 (1.1812)  time: 4.0730  data: 0.0002  max mem: 54684
+[01:21:54.945867] Epoch: [0]  [670/3229]  lr: 0.000050  grad_norm: 0.9121 (1.0517)  closs: 1.1193 (1.1797)  time: 4.0729  data: 0.0002  max mem: 54684
+[01:22:36.533925] Epoch: [0]  [680/3229]  lr: 0.000050  grad_norm: 0.9163 (1.0500)  closs: 1.1473 (1.1795)  time: 4.1097  data: 0.0002  max mem: 54684
+[01:23:17.315950] Epoch: [0]  [690/3229]  lr: 0.000050  grad_norm: 0.9097 (1.0480)  closs: 1.1498 (1.1785)  time: 4.1184  data: 0.0002  max mem: 54684
+[01:23:57.791032] Epoch: [0]  [700/3229]  lr: 0.000050  grad_norm: 0.8856 (1.0454)  closs: 1.1322 (1.1770)  time: 4.0628  data: 0.0003  max mem: 54684
+[01:24:38.841536] Epoch: [0]  [710/3229]  lr: 0.000050  grad_norm: 0.8786 (1.0446)  closs: 1.1357 (1.1766)  time: 4.0762  data: 0.0003  max mem: 54684
+[01:25:19.414277] Epoch: [0]  [720/3229]  lr: 0.000050  grad_norm: 0.9131 (1.0427)  closs: 1.1209 (1.1750)  time: 4.0811  data: 0.0002  max mem: 54684
+[01:26:00.560290] Epoch: [0]  [730/3229]  lr: 0.000050  grad_norm: 0.9133 (1.0408)  closs: 1.0723 (1.1737)  time: 4.0859  data: 0.0002  max mem: 54684
+[01:26:41.063904] Epoch: [0]  [740/3229]  lr: 0.000050  grad_norm: 0.8809 (1.0382)  closs: 1.0816 (1.1727)  time: 4.0824  data: 0.0002  max mem: 54684
+[01:27:22.243420] Epoch: [0]  [750/3229]  lr: 0.000050  grad_norm: 0.8721 (1.0366)  closs: 1.1144 (1.1718)  time: 4.0841  data: 0.0002  max mem: 54684
+[01:28:03.061085] Epoch: [0]  [760/3229]  lr: 0.000050  grad_norm: 0.8563 (1.0340)  closs: 1.1227 (1.1713)  time: 4.0998  data: 0.0002  max mem: 54684
+[01:28:43.881750] Epoch: [0]  [770/3229]  lr: 0.000050  grad_norm: 0.8563 (1.0319)  closs: 1.1247 (1.1701)  time: 4.0818  data: 0.0002  max mem: 54684
+[01:29:24.382823] Epoch: [0]  [780/3229]  lr: 0.000050  grad_norm: 0.8666 (1.0297)  closs: 1.0862 (1.1689)  time: 4.0660  data: 0.0003  max mem: 54684
+[01:30:05.263440] Epoch: [0]  [790/3229]  lr: 0.000050  grad_norm: 0.8803 (1.0278)  closs: 1.0773 (1.1675)  time: 4.0690  data: 0.0002  max mem: 54684
+[01:30:46.132360] Epoch: [0]  [800/3229]  lr: 0.000050  grad_norm: 0.8803 (1.0263)  closs: 1.0871 (1.1669)  time: 4.0874  data: 0.0002  max mem: 54684
+[01:31:26.624450] Epoch: [0]  [810/3229]  lr: 0.000050  grad_norm: 0.8782 (1.0245)  closs: 1.0958 (1.1659)  time: 4.0680  data: 0.0002  max mem: 54684
+[01:32:07.126461] Epoch: [0]  [820/3229]  lr: 0.000050  grad_norm: 0.8762 (1.0225)  closs: 1.0958 (1.1649)  time: 4.0496  data: 0.0002  max mem: 54684
+[01:32:48.359187] Epoch: [0]  [830/3229]  lr: 0.000050  grad_norm: 0.8256 (1.0204)  closs: 1.0959 (1.1644)  time: 4.0867  data: 0.0002  max mem: 54684
+[01:33:29.469866] Epoch: [0]  [840/3229]  lr: 0.000050  grad_norm: 0.8423 (1.0185)  closs: 1.1185 (1.1638)  time: 4.1171  data: 0.0002  max mem: 54684
+[01:34:09.970107] Epoch: [0]  [850/3229]  lr: 0.000050  grad_norm: 0.8891 (1.0170)  closs: 1.1185 (1.1627)  time: 4.0805  data: 0.0002  max mem: 54684
+[01:34:50.161303] Epoch: [0]  [860/3229]  lr: 0.000050  grad_norm: 0.8891 (1.0154)  closs: 1.0997 (1.1617)  time: 4.0345  data: 0.0002  max mem: 54684
+[01:35:31.765849] Epoch: [0]  [870/3229]  lr: 0.000050  grad_norm: 0.8328 (1.0135)  closs: 1.0972 (1.1610)  time: 4.0897  data: 0.0002  max mem: 54684
+[01:36:12.904367] Epoch: [0]  [880/3229]  lr: 0.000050  grad_norm: 0.8507 (1.0124)  closs: 1.0972 (1.1607)  time: 4.1371  data: 0.0002  max mem: 54684
+[01:36:54.052581] Epoch: [0]  [890/3229]  lr: 0.000050  grad_norm: 0.8788 (1.0109)  closs: 1.1339 (1.1602)  time: 4.1143  data: 0.0002  max mem: 54684
+[01:37:34.868212] Epoch: [0]  [900/3229]  lr: 0.000050  grad_norm: 0.8585 (1.0094)  closs: 1.1287 (1.1597)  time: 4.0981  data: 0.0002  max mem: 54684
+[01:38:16.113165] Epoch: [0]  [910/3229]  lr: 0.000050  grad_norm: 0.8376 (1.0077)  closs: 1.1211 (1.1589)  time: 4.1030  data: 0.0002  max mem: 54684
+[01:38:56.583668] Epoch: [0]  [920/3229]  lr: 0.000050  grad_norm: 0.8361 (1.0058)  closs: 1.0987 (1.1582)  time: 4.0857  data: 0.0002  max mem: 54684
+[01:39:37.055401] Epoch: [0]  [930/3229]  lr: 0.000050  grad_norm: 0.8472 (1.0044)  closs: 1.1219 (1.1577)  time: 4.0470  data: 0.0002  max mem: 54684
+[01:40:18.207956] Epoch: [0]  [940/3229]  lr: 0.000050  grad_norm: 0.8701 (1.0031)  closs: 1.1219 (1.1571)  time: 4.0811  data: 0.0002  max mem: 54684
+[01:40:59.474520] Epoch: [0]  [950/3229]  lr: 0.000050  grad_norm: 0.8784 (1.0016)  closs: 1.1027 (1.1563)  time: 4.1209  data: 0.0002  max mem: 54684
+[01:41:40.642522] Epoch: [0]  [960/3229]  lr: 0.000050  grad_norm: 0.8791 (1.0010)  closs: 1.0988 (1.1557)  time: 4.1217  data: 0.0002  max mem: 54684
+[01:42:21.476520] Epoch: [0]  [970/3229]  lr: 0.000050  grad_norm: 0.8580 (0.9997)  closs: 1.1107 (1.1550)  time: 4.1000  data: 0.0002  max mem: 54684
+[01:43:01.957616] Epoch: [0]  [980/3229]  lr: 0.000050  grad_norm: 0.8710 (0.9985)  closs: 1.1156 (1.1545)  time: 4.0657  data: 0.0002  max mem: 54684
+[01:43:43.560118] Epoch: [0]  [990/3229]  lr: 0.000050  grad_norm: 0.8756 (0.9976)  closs: 1.1236 (1.1544)  time: 4.1041  data: 0.0002  max mem: 54684
+[01:44:24.011660] Epoch: [0]  [1000/3229]  lr: 0.000050  grad_norm: 0.8893 (0.9967)  closs: 1.1164 (1.1533)  time: 4.1026  data: 0.0002  max mem: 54684
+[01:45:04.795094] Epoch: [0]  [1010/3229]  lr: 0.000050  grad_norm: 0.8719 (0.9951)  closs: 1.0624 (1.1522)  time: 4.0617  data: 0.0002  max mem: 54684
+[01:45:45.580136] Epoch: [0]  [1020/3229]  lr: 0.000050  grad_norm: 0.8502 (0.9936)  closs: 1.0902 (1.1518)  time: 4.0784  data: 0.0002  max mem: 54684
+[01:46:27.178733] Epoch: [0]  [1030/3229]  lr: 0.000050  grad_norm: 0.8502 (0.9924)  closs: 1.1499 (1.1517)  time: 4.1191  data: 0.0002  max mem: 54684
+[01:47:07.978058] Epoch: [0]  [1040/3229]  lr: 0.000050  grad_norm: 0.8731 (0.9909)  closs: 1.0968 (1.1510)  time: 4.1198  data: 0.0002  max mem: 54684
+[01:47:49.138856] Epoch: [0]  [1050/3229]  lr: 0.000050  grad_norm: 0.8731 (0.9900)  closs: 1.1148 (1.1508)  time: 4.0979  data: 0.0002  max mem: 54684
+[01:48:30.427876] Epoch: [0]  [1060/3229]  lr: 0.000050  grad_norm: 0.8918 (0.9889)  closs: 1.1191 (1.1504)  time: 4.1224  data: 0.0002  max mem: 54684
+[01:49:10.546105] Epoch: [0]  [1070/3229]  lr: 0.000050  grad_norm: 0.8288 (0.9871)  closs: 1.0949 (1.1494)  time: 4.0703  data: 0.0002  max mem: 54684
+[01:49:51.703975] Epoch: [0]  [1080/3229]  lr: 0.000050  grad_norm: 0.8567 (0.9861)  closs: 1.1071 (1.1494)  time: 4.0637  data: 0.0002  max mem: 54684
+[01:50:32.529173] Epoch: [0]  [1090/3229]  lr: 0.000050  grad_norm: 0.8706 (0.9851)  closs: 1.1351 (1.1491)  time: 4.0991  data: 0.0002  max mem: 54684
+[01:51:12.582431] Epoch: [0]  [1100/3229]  lr: 0.000050  grad_norm: 0.8475 (0.9835)  closs: 1.0957 (1.1482)  time: 4.0438  data: 0.0002  max mem: 54684
+[01:51:53.016781] Epoch: [0]  [1110/3229]  lr: 0.000050  grad_norm: 0.8386 (0.9821)  closs: 1.0904 (1.1474)  time: 4.0243  data: 0.0002  max mem: 54684
+[01:52:33.815856] Epoch: [0]  [1120/3229]  lr: 0.000050  grad_norm: 0.8762 (0.9811)  closs: 1.1067 (1.1470)  time: 4.0616  data: 0.0002  max mem: 54684
+[01:53:14.619418] Epoch: [0]  [1130/3229]  lr: 0.000050  grad_norm: 0.8767 (0.9802)  closs: 1.0902 (1.1461)  time: 4.0801  data: 0.0002  max mem: 54684
+[01:53:55.215391] Epoch: [0]  [1140/3229]  lr: 0.000050  grad_norm: 0.8585 (0.9789)  closs: 1.0626 (1.1455)  time: 4.0699  data: 0.0002  max mem: 54684
+[01:54:36.057646] Epoch: [0]  [1150/3229]  lr: 0.000050  grad_norm: 0.8274 (0.9774)  closs: 1.0626 (1.1444)  time: 4.0718  data: 0.0002  max mem: 54684
+[01:55:16.560302] Epoch: [0]  [1160/3229]  lr: 0.000050  grad_norm: 0.8237 (0.9766)  closs: 1.0592 (1.1436)  time: 4.0672  data: 0.0002  max mem: 54684
+[01:55:57.057967] Epoch: [0]  [1170/3229]  lr: 0.000050  grad_norm: 0.8557 (0.9756)  closs: 1.0592 (1.1425)  time: 4.0499  data: 0.0002  max mem: 54684
+[01:56:37.362333] Epoch: [0]  [1180/3229]  lr: 0.000050  grad_norm: 0.8549 (0.9744)  closs: 1.0497 (1.1417)  time: 4.0400  data: 0.0002  max mem: 54684
+[01:57:18.113945] Epoch: [0]  [1190/3229]  lr: 0.000050  grad_norm: 0.8588 (0.9733)  closs: 1.0712 (1.1410)  time: 4.0527  data: 0.0002  max mem: 54684
+[01:57:58.946014] Epoch: [0]  [1200/3229]  lr: 0.000050  grad_norm: 0.8545 (0.9721)  closs: 1.1016 (1.1406)  time: 4.0791  data: 0.0002  max mem: 54684
+[01:58:39.470964] Epoch: [0]  [1210/3229]  lr: 0.000050  grad_norm: 0.8155 (0.9705)  closs: 1.0832 (1.1398)  time: 4.0678  data: 0.0002  max mem: 54684
+[01:59:20.231476] Epoch: [0]  [1220/3229]  lr: 0.000050  grad_norm: 0.8030 (0.9693)  closs: 1.0567 (1.1392)  time: 4.0642  data: 0.0002  max mem: 54684
+[02:00:01.565697] Epoch: [0]  [1230/3229]  lr: 0.000050  grad_norm: 0.8324 (0.9682)  closs: 1.1310 (1.1394)  time: 4.1047  data: 0.0002  max mem: 54684
+[02:00:42.411163] Epoch: [0]  [1240/3229]  lr: 0.000050  grad_norm: 0.8455 (0.9676)  closs: 1.1277 (1.1390)  time: 4.1089  data: 0.0002  max mem: 54684
+[02:01:22.913104] Epoch: [0]  [1250/3229]  lr: 0.000050  grad_norm: 0.8509 (0.9664)  closs: 1.0848 (1.1386)  time: 4.0673  data: 0.0002  max mem: 54684
+[02:02:03.681895] Epoch: [0]  [1260/3229]  lr: 0.000049  grad_norm: 0.8150 (0.9655)  closs: 1.0657 (1.1378)  time: 4.0635  data: 0.0002  max mem: 54684
+[02:02:44.382490] Epoch: [0]  [1270/3229]  lr: 0.000049  grad_norm: 0.8150 (0.9645)  closs: 1.0536 (1.1371)  time: 4.0734  data: 0.0002  max mem: 54684
+[02:03:25.548675] Epoch: [0]  [1280/3229]  lr: 0.000049  grad_norm: 0.8142 (0.9634)  closs: 1.0764 (1.1366)  time: 4.0933  data: 0.0002  max mem: 54684
+[02:04:05.764704] Epoch: [0]  [1290/3229]  lr: 0.000049  grad_norm: 0.8104 (0.9624)  closs: 1.0764 (1.1362)  time: 4.0690  data: 0.0002  max mem: 54684
+[02:04:46.308118] Epoch: [0]  [1300/3229]  lr: 0.000049  grad_norm: 0.8170 (0.9614)  closs: 1.0782 (1.1357)  time: 4.0379  data: 0.0002  max mem: 54684
+[02:05:27.268012] Epoch: [0]  [1310/3229]  lr: 0.000049  grad_norm: 0.8101 (0.9602)  closs: 1.0872 (1.1352)  time: 4.0751  data: 0.0002  max mem: 54684
+[02:06:07.751784] Epoch: [0]  [1320/3229]  lr: 0.000049  grad_norm: 0.8024 (0.9591)  closs: 1.0757 (1.1345)  time: 4.0721  data: 0.0002  max mem: 54684
+[02:06:48.603185] Epoch: [0]  [1330/3229]  lr: 0.000049  grad_norm: 0.8059 (0.9579)  closs: 1.0757 (1.1343)  time: 4.0667  data: 0.0002  max mem: 54684
+[02:07:29.989007] Epoch: [0]  [1340/3229]  lr: 0.000049  grad_norm: 0.8320 (0.9575)  closs: 1.1105 (1.1342)  time: 4.1118  data: 0.0002  max mem: 54684
+[02:08:10.751898] Epoch: [0]  [1350/3229]  lr: 0.000049  grad_norm: 0.8769 (0.9566)  closs: 1.0763 (1.1338)  time: 4.1074  data: 0.0002  max mem: 54684
+[02:08:51.277537] Epoch: [0]  [1360/3229]  lr: 0.000049  grad_norm: 0.8228 (0.9556)  closs: 1.0611 (1.1332)  time: 4.0644  data: 0.0002  max mem: 54684
+[02:09:32.452759] Epoch: [0]  [1370/3229]  lr: 0.000049  grad_norm: 0.8208 (0.9549)  closs: 1.0753 (1.1329)  time: 4.0850  data: 0.0002  max mem: 54684
+[02:10:13.889635] Epoch: [0]  [1380/3229]  lr: 0.000049  grad_norm: 0.8204 (0.9543)  closs: 1.0725 (1.1325)  time: 4.1305  data: 0.0002  max mem: 54684
+[02:10:54.494775] Epoch: [0]  [1390/3229]  lr: 0.000049  grad_norm: 0.8187 (0.9533)  closs: 1.0725 (1.1321)  time: 4.1020  data: 0.0002  max mem: 54684
+[02:11:35.317556] Epoch: [0]  [1400/3229]  lr: 0.000049  grad_norm: 0.8187 (0.9522)  closs: 1.0902 (1.1317)  time: 4.0713  data: 0.0002  max mem: 54684
+[02:12:16.125134] Epoch: [0]  [1410/3229]  lr: 0.000049  grad_norm: 0.7985 (0.9512)  closs: 1.0906 (1.1314)  time: 4.0814  data: 0.0002  max mem: 54684
+[02:12:57.277533] Epoch: [0]  [1420/3229]  lr: 0.000049  grad_norm: 0.8158 (0.9501)  closs: 1.1104 (1.1314)  time: 4.0979  data: 0.0002  max mem: 54684
+[02:13:37.904443] Epoch: [0]  [1430/3229]  lr: 0.000049  grad_norm: 0.8008 (0.9488)  closs: 1.0989 (1.1309)  time: 4.0889  data: 0.0002  max mem: 54684
+[02:14:18.743491] Epoch: [0]  [1440/3229]  lr: 0.000049  grad_norm: 0.8008 (0.9479)  closs: 1.0963 (1.1307)  time: 4.0732  data: 0.0002  max mem: 54684
+[02:14:59.571598] Epoch: [0]  [1450/3229]  lr: 0.000049  grad_norm: 0.8153 (0.9467)  closs: 1.1322 (1.1308)  time: 4.0833  data: 0.0002  max mem: 54684
+[02:15:40.772886] Epoch: [0]  [1460/3229]  lr: 0.000049  grad_norm: 0.8315 (0.9465)  closs: 1.1439 (1.1306)  time: 4.1014  data: 0.0002  max mem: 54684
+[02:16:21.736547] Epoch: [0]  [1470/3229]  lr: 0.000049  grad_norm: 0.8342 (0.9457)  closs: 1.0850 (1.1301)  time: 4.1082  data: 0.0002  max mem: 54684
+[02:17:02.910967] Epoch: [0]  [1480/3229]  lr: 0.000049  grad_norm: 0.7991 (0.9446)  closs: 1.0827 (1.1298)  time: 4.1068  data: 0.0002  max mem: 54684
+[02:17:44.083251] Epoch: [0]  [1490/3229]  lr: 0.000049  grad_norm: 0.7991 (0.9442)  closs: 1.1022 (1.1297)  time: 4.1173  data: 0.0002  max mem: 54684
+[02:18:26.042677] Epoch: [0]  [1500/3229]  lr: 0.000049  grad_norm: 0.8720 (0.9438)  closs: 1.1096 (1.1297)  time: 4.1565  data: 0.0002  max mem: 54684
+[02:19:06.874119] Epoch: [0]  [1510/3229]  lr: 0.000049  grad_norm: 0.8466 (0.9430)  closs: 1.0920 (1.1293)  time: 4.1395  data: 0.0002  max mem: 54684
+[02:19:48.004002] Epoch: [0]  [1520/3229]  lr: 0.000049  grad_norm: 0.8431 (0.9425)  closs: 1.0898 (1.1291)  time: 4.0980  data: 0.0002  max mem: 54684
+[02:20:29.271406] Epoch: [0]  [1530/3229]  lr: 0.000049  grad_norm: 0.8229 (0.9419)  closs: 1.0907 (1.1289)  time: 4.1198  data: 0.0002  max mem: 54684
+[02:21:10.763947] Epoch: [0]  [1540/3229]  lr: 0.000049  grad_norm: 0.8301 (0.9413)  closs: 1.1024 (1.1285)  time: 4.1379  data: 0.0002  max mem: 54684
+[02:21:50.930108] Epoch: [0]  [1550/3229]  lr: 0.000049  grad_norm: 0.8301 (0.9403)  closs: 1.0550 (1.1277)  time: 4.0829  data: 0.0002  max mem: 54684
+[02:22:31.739856] Epoch: [0]  [1560/3229]  lr: 0.000049  grad_norm: 0.7819 (0.9392)  closs: 1.0473 (1.1272)  time: 4.0487  data: 0.0002  max mem: 54684
+[02:23:12.906228] Epoch: [0]  [1570/3229]  lr: 0.000049  grad_norm: 0.7600 (0.9384)  closs: 1.0405 (1.1268)  time: 4.0987  data: 0.0002  max mem: 54684
+[02:23:54.080302] Epoch: [0]  [1580/3229]  lr: 0.000049  grad_norm: 0.8070 (0.9376)  closs: 1.0689 (1.1264)  time: 4.1169  data: 0.0002  max mem: 54684
+[02:24:34.900214] Epoch: [0]  [1590/3229]  lr: 0.000049  grad_norm: 0.8137 (0.9367)  closs: 1.0689 (1.1259)  time: 4.0996  data: 0.0002  max mem: 54684
+[02:25:14.745725] Epoch: [0]  [1600/3229]  lr: 0.000049  grad_norm: 0.7806 (0.9354)  closs: 1.0465 (1.1251)  time: 4.0332  data: 0.0002  max mem: 54684
+[02:25:55.704023] Epoch: [0]  [1610/3229]  lr: 0.000049  grad_norm: 0.8074 (0.9348)  closs: 1.0935 (1.1250)  time: 4.0401  data: 0.0002  max mem: 54684
+[02:26:37.197949] Epoch: [0]  [1620/3229]  lr: 0.000049  grad_norm: 0.8162 (0.9342)  closs: 1.1148 (1.1250)  time: 4.1225  data: 0.0002  max mem: 54684
+[02:27:18.326238] Epoch: [0]  [1630/3229]  lr: 0.000049  grad_norm: 0.8162 (0.9335)  closs: 1.0970 (1.1247)  time: 4.1310  data: 0.0002  max mem: 54684
+[02:27:59.451089] Epoch: [0]  [1640/3229]  lr: 0.000049  grad_norm: 0.8227 (0.9328)  closs: 1.1007 (1.1248)  time: 4.1126  data: 0.0002  max mem: 54684
+[02:28:40.359589] Epoch: [0]  [1650/3229]  lr: 0.000049  grad_norm: 0.8227 (0.9321)  closs: 1.0987 (1.1244)  time: 4.1016  data: 0.0002  max mem: 54684
+[02:29:21.499384] Epoch: [0]  [1660/3229]  lr: 0.000049  grad_norm: 0.7906 (0.9314)  closs: 1.0815 (1.1243)  time: 4.1023  data: 0.0002  max mem: 54684
+[02:30:01.961500] Epoch: [0]  [1670/3229]  lr: 0.000049  grad_norm: 0.7906 (0.9306)  closs: 1.1051 (1.1242)  time: 4.0800  data: 0.0002  max mem: 54684
+[02:30:43.093943] Epoch: [0]  [1680/3229]  lr: 0.000049  grad_norm: 0.7620 (0.9296)  closs: 1.0947 (1.1241)  time: 4.0797  data: 0.0002  max mem: 54684
+[02:31:23.378880] Epoch: [0]  [1690/3229]  lr: 0.000049  grad_norm: 0.7389 (0.9285)  closs: 1.0770 (1.1237)  time: 4.0708  data: 0.0002  max mem: 54684
+[02:32:04.494682] Epoch: [0]  [1700/3229]  lr: 0.000049  grad_norm: 0.7654 (0.9278)  closs: 1.0655 (1.1234)  time: 4.0700  data: 0.0002  max mem: 54684
+[02:32:44.682897] Epoch: [0]  [1710/3229]  lr: 0.000048  grad_norm: 0.7788 (0.9267)  closs: 1.0655 (1.1229)  time: 4.0651  data: 0.0002  max mem: 54684
+[02:33:25.176480] Epoch: [0]  [1720/3229]  lr: 0.000048  grad_norm: 0.7793 (0.9260)  closs: 1.0609 (1.1226)  time: 4.0340  data: 0.0002  max mem: 54684
+[02:34:06.615615] Epoch: [0]  [1730/3229]  lr: 0.000048  grad_norm: 0.7983 (0.9254)  closs: 1.0603 (1.1224)  time: 4.0966  data: 0.0002  max mem: 54684
+[02:34:47.629024] Epoch: [0]  [1740/3229]  lr: 0.000048  grad_norm: 0.7874 (0.9245)  closs: 1.0761 (1.1222)  time: 4.1226  data: 0.0002  max mem: 54684
+[02:35:28.439728] Epoch: [0]  [1750/3229]  lr: 0.000048  grad_norm: 0.8113 (0.9241)  closs: 1.0861 (1.1220)  time: 4.0911  data: 0.0002  max mem: 54684
+[02:36:09.569880] Epoch: [0]  [1760/3229]  lr: 0.000048  grad_norm: 0.8314 (0.9236)  closs: 1.1081 (1.1218)  time: 4.0970  data: 0.0002  max mem: 54684
+[02:36:50.672861] Epoch: [0]  [1770/3229]  lr: 0.000048  grad_norm: 0.8306 (0.9229)  closs: 1.0888 (1.1214)  time: 4.1116  data: 0.0002  max mem: 54684
+[02:37:31.667284] Epoch: [0]  [1780/3229]  lr: 0.000048  grad_norm: 0.8092 (0.9221)  closs: 1.0832 (1.1211)  time: 4.1048  data: 0.0002  max mem: 54684
+[02:38:12.473138] Epoch: [0]  [1790/3229]  lr: 0.000048  grad_norm: 0.7678 (0.9212)  closs: 1.0395 (1.1207)  time: 4.0899  data: 0.0002  max mem: 54684
+[02:38:52.942058] Epoch: [0]  [1800/3229]  lr: 0.000048  grad_norm: 0.7929 (0.9204)  closs: 1.0356 (1.1204)  time: 4.0637  data: 0.0002  max mem: 54684
+[02:39:33.666135] Epoch: [0]  [1810/3229]  lr: 0.000048  grad_norm: 0.7905 (0.9195)  closs: 1.0807 (1.1202)  time: 4.0596  data: 0.0002  max mem: 54684
+[02:40:14.381280] Epoch: [0]  [1820/3229]  lr: 0.000048  grad_norm: 0.7724 (0.9187)  closs: 1.1055 (1.1200)  time: 4.0719  data: 0.0002  max mem: 54684
+[02:40:55.197221] Epoch: [0]  [1830/3229]  lr: 0.000048  grad_norm: 0.7724 (0.9180)  closs: 1.0897 (1.1199)  time: 4.0765  data: 0.0002  max mem: 54684
+[02:41:35.043279] Epoch: [0]  [1840/3229]  lr: 0.000048  grad_norm: 0.7413 (0.9168)  closs: 1.0573 (1.1195)  time: 4.0330  data: 0.0002  max mem: 54684
+[02:42:16.132889] Epoch: [0]  [1850/3229]  lr: 0.000048  grad_norm: 0.7712 (0.9161)  closs: 1.0468 (1.1191)  time: 4.0467  data: 0.0002  max mem: 54684
+[02:42:56.518513] Epoch: [0]  [1860/3229]  lr: 0.000048  grad_norm: 0.7990 (0.9154)  closs: 1.0714 (1.1188)  time: 4.0737  data: 0.0002  max mem: 54684
+[02:43:37.655055] Epoch: [0]  [1870/3229]  lr: 0.000048  grad_norm: 0.8236 (0.9150)  closs: 1.1020 (1.1187)  time: 4.0760  data: 0.0002  max mem: 54684
+[02:44:18.501800] Epoch: [0]  [1880/3229]  lr: 0.000048  grad_norm: 0.8055 (0.9143)  closs: 1.0773 (1.1183)  time: 4.0991  data: 0.0002  max mem: 54684
+[02:44:59.327626] Epoch: [0]  [1890/3229]  lr: 0.000048  grad_norm: 0.7469 (0.9135)  closs: 1.0612 (1.1180)  time: 4.0836  data: 0.0002  max mem: 54684
+[02:45:40.957982] Epoch: [0]  [1900/3229]  lr: 0.000048  grad_norm: 0.8118 (0.9132)  closs: 1.0947 (1.1179)  time: 4.1227  data: 0.0002  max mem: 54684
+[02:46:21.456520] Epoch: [0]  [1910/3229]  lr: 0.000048  grad_norm: 0.8009 (0.9124)  closs: 1.0516 (1.1173)  time: 4.1064  data: 0.0002  max mem: 54684
+[02:47:01.609291] Epoch: [0]  [1920/3229]  lr: 0.000048  grad_norm: 0.8009 (0.9121)  closs: 1.0265 (1.1168)  time: 4.0325  data: 0.0002  max mem: 54684
+[02:47:43.618438] Epoch: [0]  [1930/3229]  lr: 0.000048  grad_norm: 0.8074 (0.9115)  closs: 1.0395 (1.1167)  time: 4.1080  data: 0.0002  max mem: 54684
+[02:48:24.870145] Epoch: [0]  [1940/3229]  lr: 0.000048  grad_norm: 0.7995 (0.9110)  closs: 1.1143 (1.1167)  time: 4.1630  data: 0.0002  max mem: 54684
+[02:49:06.019794] Epoch: [0]  [1950/3229]  lr: 0.000048  grad_norm: 0.8393 (0.9109)  closs: 1.1143 (1.1166)  time: 4.1200  data: 0.0001  max mem: 54684
+[02:49:46.491553] Epoch: [0]  [1960/3229]  lr: 0.000048  grad_norm: 0.8373 (0.9102)  closs: 1.0866 (1.1165)  time: 4.0810  data: 0.0002  max mem: 54684
+[02:50:27.704222] Epoch: [0]  [1970/3229]  lr: 0.000048  grad_norm: 0.8284 (0.9099)  closs: 1.0866 (1.1163)  time: 4.0841  data: 0.0002  max mem: 54684
+[02:51:08.271766] Epoch: [0]  [1980/3229]  lr: 0.000048  grad_norm: 0.7955 (0.9092)  closs: 1.0799 (1.1161)  time: 4.0890  data: 0.0002  max mem: 54684
+[02:51:49.381940] Epoch: [0]  [1990/3229]  lr: 0.000048  grad_norm: 0.7955 (0.9090)  closs: 1.0799 (1.1159)  time: 4.0838  data: 0.0002  max mem: 54684
+[02:52:29.511376] Epoch: [0]  [2000/3229]  lr: 0.000048  grad_norm: 0.8154 (0.9083)  closs: 1.0683 (1.1154)  time: 4.0619  data: 0.0002  max mem: 54684
+[02:53:10.897602] Epoch: [0]  [2010/3229]  lr: 0.000048  grad_norm: 0.8103 (0.9078)  closs: 1.0683 (1.1152)  time: 4.0757  data: 0.0002  max mem: 54684
+[02:53:51.562695] Epoch: [0]  [2020/3229]  lr: 0.000047  grad_norm: 0.8103 (0.9072)  closs: 1.0827 (1.1151)  time: 4.1025  data: 0.0002  max mem: 54684
+[02:54:31.690739] Epoch: [0]  [2030/3229]  lr: 0.000047  grad_norm: 0.7805 (0.9065)  closs: 1.0595 (1.1145)  time: 4.0396  data: 0.0002  max mem: 54684
+[02:55:12.459763] Epoch: [0]  [2040/3229]  lr: 0.000047  grad_norm: 0.7728 (0.9058)  closs: 1.0613 (1.1143)  time: 4.0448  data: 0.0002  max mem: 54684
+[02:55:54.276314] Epoch: [0]  [2050/3229]  lr: 0.000047  grad_norm: 0.7637 (0.9051)  closs: 1.1000 (1.1144)  time: 4.1292  data: 0.0002  max mem: 54684
+[02:56:35.163977] Epoch: [0]  [2060/3229]  lr: 0.000047  grad_norm: 0.7644 (0.9045)  closs: 1.1134 (1.1143)  time: 4.1351  data: 0.0002  max mem: 54684
+[02:57:15.269544] Epoch: [0]  [2070/3229]  lr: 0.000047  grad_norm: 0.7892 (0.9039)  closs: 1.0588 (1.1138)  time: 4.0496  data: 0.0002  max mem: 54684
+[02:57:56.040287] Epoch: [0]  [2080/3229]  lr: 0.000047  grad_norm: 0.8042 (0.9033)  closs: 1.0447 (1.1136)  time: 4.0438  data: 0.0002  max mem: 54684
+[02:58:36.828734] Epoch: [0]  [2090/3229]  lr: 0.000047  grad_norm: 0.7867 (0.9027)  closs: 1.0914 (1.1133)  time: 4.0779  data: 0.0002  max mem: 54684
+[02:59:17.056084] Epoch: [0]  [2100/3229]  lr: 0.000047  grad_norm: 0.7766 (0.9021)  closs: 1.0796 (1.1131)  time: 4.0507  data: 0.0002  max mem: 54684
+[02:59:57.850948] Epoch: [0]  [2110/3229]  lr: 0.000047  grad_norm: 0.7773 (0.9016)  closs: 1.0748 (1.1129)  time: 4.0510  data: 0.0002  max mem: 54684
+[03:00:38.445171] Epoch: [0]  [2120/3229]  lr: 0.000047  grad_norm: 0.8051 (0.9012)  closs: 1.0689 (1.1126)  time: 4.0694  data: 0.0002  max mem: 54684
+[03:01:19.387745] Epoch: [0]  [2130/3229]  lr: 0.000047  grad_norm: 0.8057 (0.9006)  closs: 1.0761 (1.1126)  time: 4.0768  data: 0.0002  max mem: 54684
+[03:01:59.964804] Epoch: [0]  [2140/3229]  lr: 0.000047  grad_norm: 0.8057 (0.9000)  closs: 1.1012 (1.1124)  time: 4.0759  data: 0.0002  max mem: 54684
+[03:02:40.770931] Epoch: [0]  [2150/3229]  lr: 0.000047  grad_norm: 0.7907 (0.8995)  closs: 1.0643 (1.1121)  time: 4.0691  data: 0.0002  max mem: 54684
+[03:03:21.686342] Epoch: [0]  [2160/3229]  lr: 0.000047  grad_norm: 0.7718 (0.8990)  closs: 1.0844 (1.1122)  time: 4.0860  data: 0.0002  max mem: 54684
+[03:04:02.991052] Epoch: [0]  [2170/3229]  lr: 0.000047  grad_norm: 0.7893 (0.8986)  closs: 1.1509 (1.1124)  time: 4.1109  data: 0.0002  max mem: 54684
+[03:04:44.216654] Epoch: [0]  [2180/3229]  lr: 0.000047  grad_norm: 0.8023 (0.8981)  closs: 1.0995 (1.1123)  time: 4.1264  data: 0.0002  max mem: 54684
+[03:05:24.677881] Epoch: [0]  [2190/3229]  lr: 0.000047  grad_norm: 0.8062 (0.8976)  closs: 1.0876 (1.1122)  time: 4.0843  data: 0.0002  max mem: 54684
+[03:06:05.666908] Epoch: [0]  [2200/3229]  lr: 0.000047  grad_norm: 0.8079 (0.8971)  closs: 1.0597 (1.1120)  time: 4.0724  data: 0.0002  max mem: 54684
+[03:06:46.537094] Epoch: [0]  [2210/3229]  lr: 0.000047  grad_norm: 0.8028 (0.8967)  closs: 1.0597 (1.1120)  time: 4.0929  data: 0.0002  max mem: 54684
+[03:07:27.457503] Epoch: [0]  [2220/3229]  lr: 0.000047  grad_norm: 0.8028 (0.8962)  closs: 1.0845 (1.1117)  time: 4.0895  data: 0.0002  max mem: 54684
+[03:08:08.278999] Epoch: [0]  [2230/3229]  lr: 0.000047  grad_norm: 0.7934 (0.8957)  closs: 1.0507 (1.1114)  time: 4.0870  data: 0.0002  max mem: 54684
+[03:08:48.891856] Epoch: [0]  [2240/3229]  lr: 0.000047  grad_norm: 0.7621 (0.8951)  closs: 1.0717 (1.1111)  time: 4.0717  data: 0.0002  max mem: 54684
+[03:09:29.597430] Epoch: [0]  [2250/3229]  lr: 0.000047  grad_norm: 0.7756 (0.8946)  closs: 1.0717 (1.1110)  time: 4.0659  data: 0.0002  max mem: 54684
+[03:10:10.515536] Epoch: [0]  [2260/3229]  lr: 0.000047  grad_norm: 0.7727 (0.8940)  closs: 1.0694 (1.1108)  time: 4.0811  data: 0.0002  max mem: 54684
+[03:10:51.637059] Epoch: [0]  [2270/3229]  lr: 0.000047  grad_norm: 0.7727 (0.8936)  closs: 1.0694 (1.1106)  time: 4.1019  data: 0.0002  max mem: 54684
+[03:11:32.595121] Epoch: [0]  [2280/3229]  lr: 0.000046  grad_norm: 0.7681 (0.8929)  closs: 1.0451 (1.1103)  time: 4.1039  data: 0.0002  max mem: 54684
+[03:12:13.624396] Epoch: [0]  [2290/3229]  lr: 0.000046  grad_norm: 0.7482 (0.8924)  closs: 1.0431 (1.1103)  time: 4.0993  data: 0.0002  max mem: 54684
+[03:12:54.747566] Epoch: [0]  [2300/3229]  lr: 0.000046  grad_norm: 0.7703 (0.8920)  closs: 1.1258 (1.1103)  time: 4.1076  data: 0.0002  max mem: 54684
+[03:13:34.899825] Epoch: [0]  [2310/3229]  lr: 0.000046  grad_norm: 0.7731 (0.8914)  closs: 1.0817 (1.1101)  time: 4.0637  data: 0.0002  max mem: 54684
+[03:14:15.886367] Epoch: [0]  [2320/3229]  lr: 0.000046  grad_norm: 0.8009 (0.8911)  closs: 1.0688 (1.1099)  time: 4.0569  data: 0.0002  max mem: 54684
+[03:14:56.926589] Epoch: [0]  [2330/3229]  lr: 0.000046  grad_norm: 0.7560 (0.8905)  closs: 1.0458 (1.1094)  time: 4.1013  data: 0.0002  max mem: 54684
+[03:15:37.678425] Epoch: [0]  [2340/3229]  lr: 0.000046  grad_norm: 0.7591 (0.8902)  closs: 1.0475 (1.1094)  time: 4.0895  data: 0.0002  max mem: 54684
+[03:16:18.128561] Epoch: [0]  [2350/3229]  lr: 0.000046  grad_norm: 0.8028 (0.8896)  closs: 1.0808 (1.1091)  time: 4.0600  data: 0.0002  max mem: 54684
+[03:16:59.115639] Epoch: [0]  [2360/3229]  lr: 0.000046  grad_norm: 0.7890 (0.8892)  closs: 1.0613 (1.1090)  time: 4.0718  data: 0.0002  max mem: 54684
+[03:17:39.826484] Epoch: [0]  [2370/3229]  lr: 0.000046  grad_norm: 0.7886 (0.8888)  closs: 1.0808 (1.1091)  time: 4.0848  data: 0.0002  max mem: 54684
+[03:18:20.921523] Epoch: [0]  [2380/3229]  lr: 0.000046  grad_norm: 0.7769 (0.8883)  closs: 1.1109 (1.1091)  time: 4.0902  data: 0.0002  max mem: 54684
+[03:19:01.354222] Epoch: [0]  [2390/3229]  lr: 0.000046  grad_norm: 0.7578 (0.8877)  closs: 1.0823 (1.1089)  time: 4.0763  data: 0.0002  max mem: 54684
+[03:19:42.341834] Epoch: [0]  [2400/3229]  lr: 0.000046  grad_norm: 0.7938 (0.8873)  closs: 1.0722 (1.1086)  time: 4.0710  data: 0.0002  max mem: 54684
+[03:20:23.981102] Epoch: [0]  [2410/3229]  lr: 0.000046  grad_norm: 0.8303 (0.8871)  closs: 1.0663 (1.1085)  time: 4.1313  data: 0.0002  max mem: 54684
+[03:21:04.723017] Epoch: [0]  [2420/3229]  lr: 0.000046  grad_norm: 0.8142 (0.8867)  closs: 1.0663 (1.1083)  time: 4.1190  data: 0.0002  max mem: 54684
+[03:21:45.161994] Epoch: [0]  [2430/3229]  lr: 0.000046  grad_norm: 0.7785 (0.8863)  closs: 1.0653 (1.1080)  time: 4.0590  data: 0.0002  max mem: 54684
+[03:22:26.409460] Epoch: [0]  [2440/3229]  lr: 0.000046  grad_norm: 0.7811 (0.8859)  closs: 1.0728 (1.1080)  time: 4.0843  data: 0.0002  max mem: 54684
+[03:23:07.737120] Epoch: [0]  [2450/3229]  lr: 0.000046  grad_norm: 0.7773 (0.8855)  closs: 1.1207 (1.1079)  time: 4.1287  data: 0.0002  max mem: 54684
+[03:23:48.818206] Epoch: [0]  [2460/3229]  lr: 0.000046  grad_norm: 0.7819 (0.8853)  closs: 1.0838 (1.1078)  time: 4.1204  data: 0.0002  max mem: 54684
+[03:24:29.929209] Epoch: [0]  [2470/3229]  lr: 0.000046  grad_norm: 0.8072 (0.8851)  closs: 1.0518 (1.1075)  time: 4.1095  data: 0.0002  max mem: 54684
+[03:25:10.886008] Epoch: [0]  [2480/3229]  lr: 0.000046  grad_norm: 0.7804 (0.8846)  closs: 1.0573 (1.1073)  time: 4.1033  data: 0.0002  max mem: 54684
+[03:25:52.557958] Epoch: [0]  [2490/3229]  lr: 0.000046  grad_norm: 0.7737 (0.8845)  closs: 1.0763 (1.1072)  time: 4.1314  data: 0.0002  max mem: 54684
+[03:26:33.671349] Epoch: [0]  [2500/3229]  lr: 0.000045  grad_norm: 0.7789 (0.8840)  closs: 1.1176 (1.1073)  time: 4.1392  data: 0.0002  max mem: 54684
+[03:27:14.135334] Epoch: [0]  [2510/3229]  lr: 0.000045  grad_norm: 0.7701 (0.8835)  closs: 1.0906 (1.1070)  time: 4.0788  data: 0.0002  max mem: 54684
+[03:27:55.432584] Epoch: [0]  [2520/3229]  lr: 0.000045  grad_norm: 0.7708 (0.8832)  closs: 1.0723 (1.1069)  time: 4.0880  data: 0.0002  max mem: 54684
+[03:28:36.466155] Epoch: [0]  [2530/3229]  lr: 0.000045  grad_norm: 0.7946 (0.8827)  closs: 1.0810 (1.1069)  time: 4.1165  data: 0.0002  max mem: 54684
+[03:29:17.239455] Epoch: [0]  [2540/3229]  lr: 0.000045  grad_norm: 0.7615 (0.8823)  closs: 1.0746 (1.1067)  time: 4.0903  data: 0.0002  max mem: 54684
+[03:29:58.019807] Epoch: [0]  [2550/3229]  lr: 0.000045  grad_norm: 0.7483 (0.8819)  closs: 1.0516 (1.1065)  time: 4.0776  data: 0.0002  max mem: 54684
+[03:30:39.329898] Epoch: [0]  [2560/3229]  lr: 0.000045  grad_norm: 0.7708 (0.8816)  closs: 1.0516 (1.1063)  time: 4.1045  data: 0.0002  max mem: 54684
+[03:31:20.317679] Epoch: [0]  [2570/3229]  lr: 0.000045  grad_norm: 0.7788 (0.8812)  closs: 1.0652 (1.1062)  time: 4.1148  data: 0.0002  max mem: 54684
+[03:32:01.406782] Epoch: [0]  [2580/3229]  lr: 0.000045  grad_norm: 0.7574 (0.8808)  closs: 1.0688 (1.1062)  time: 4.1038  data: 0.0002  max mem: 54684
+[03:32:41.538400] Epoch: [0]  [2590/3229]  lr: 0.000045  grad_norm: 0.7515 (0.8802)  closs: 1.0843 (1.1059)  time: 4.0610  data: 0.0002  max mem: 54684
+[03:33:22.786415] Epoch: [0]  [2600/3229]  lr: 0.000045  grad_norm: 0.8014 (0.8799)  closs: 1.0804 (1.1058)  time: 4.0689  data: 0.0002  max mem: 54684
+[03:34:03.508058] Epoch: [0]  [2610/3229]  lr: 0.000045  grad_norm: 0.8014 (0.8794)  closs: 1.0795 (1.1055)  time: 4.0984  data: 0.0002  max mem: 54684
+[03:34:43.958992] Epoch: [0]  [2620/3229]  lr: 0.000045  grad_norm: 0.7646 (0.8789)  closs: 1.0806 (1.1054)  time: 4.0586  data: 0.0002  max mem: 54684
+[03:35:25.187896] Epoch: [0]  [2630/3229]  lr: 0.000045  grad_norm: 0.7696 (0.8785)  closs: 1.1043 (1.1055)  time: 4.0839  data: 0.0002  max mem: 54684
+[03:36:06.087296] Epoch: [0]  [2640/3229]  lr: 0.000045  grad_norm: 0.8024 (0.8782)  closs: 1.1064 (1.1054)  time: 4.1063  data: 0.0002  max mem: 54684
+[03:36:46.644106] Epoch: [0]  [2650/3229]  lr: 0.000045  grad_norm: 0.7676 (0.8777)  closs: 1.0427 (1.1051)  time: 4.0727  data: 0.0002  max mem: 54684
+[03:37:27.758262] Epoch: [0]  [2660/3229]  lr: 0.000045  grad_norm: 0.7810 (0.8776)  closs: 1.0405 (1.1049)  time: 4.0835  data: 0.0002  max mem: 54684
+[03:38:08.680275] Epoch: [0]  [2670/3229]  lr: 0.000045  grad_norm: 0.8024 (0.8774)  closs: 1.0546 (1.1047)  time: 4.1017  data: 0.0002  max mem: 54684
+[03:38:49.498507] Epoch: [0]  [2680/3229]  lr: 0.000045  grad_norm: 0.7959 (0.8770)  closs: 1.0579 (1.1046)  time: 4.0869  data: 0.0002  max mem: 54684
+[03:39:30.230437] Epoch: [0]  [2690/3229]  lr: 0.000045  grad_norm: 0.7603 (0.8766)  closs: 1.0688 (1.1044)  time: 4.0774  data: 0.0002  max mem: 54684
+[03:40:10.699217] Epoch: [0]  [2700/3229]  lr: 0.000045  grad_norm: 0.7214 (0.8762)  closs: 1.0769 (1.1043)  time: 4.0600  data: 0.0002  max mem: 54684
+[03:40:51.627601] Epoch: [0]  [2710/3229]  lr: 0.000044  grad_norm: 0.7282 (0.8757)  closs: 1.0952 (1.1042)  time: 4.0698  data: 0.0002  max mem: 54684
+[03:41:32.962290] Epoch: [0]  [2720/3229]  lr: 0.000044  grad_norm: 0.7610 (0.8755)  closs: 1.0952 (1.1042)  time: 4.1131  data: 0.0002  max mem: 54684
+[03:42:13.881942] Epoch: [0]  [2730/3229]  lr: 0.000044  grad_norm: 0.7852 (0.8752)  closs: 1.1198 (1.1041)  time: 4.1127  data: 0.0002  max mem: 54684
+[03:42:54.354106] Epoch: [0]  [2740/3229]  lr: 0.000044  grad_norm: 0.7901 (0.8749)  closs: 1.0710 (1.1039)  time: 4.0695  data: 0.0002  max mem: 54684
+[03:43:35.270721] Epoch: [0]  [2750/3229]  lr: 0.000044  grad_norm: 0.7921 (0.8746)  closs: 1.0635 (1.1037)  time: 4.0694  data: 0.0002  max mem: 54684
+[03:44:16.560153] Epoch: [0]  [2760/3229]  lr: 0.000044  grad_norm: 0.7653 (0.8742)  closs: 1.0744 (1.1036)  time: 4.1102  data: 0.0002  max mem: 54684
+[03:44:57.121862] Epoch: [0]  [2770/3229]  lr: 0.000044  grad_norm: 0.7653 (0.8738)  closs: 1.0760 (1.1034)  time: 4.0925  data: 0.0002  max mem: 54684
+[03:45:37.257683] Epoch: [0]  [2780/3229]  lr: 0.000044  grad_norm: 0.7473 (0.8732)  closs: 1.0355 (1.1030)  time: 4.0348  data: 0.0002  max mem: 54684
+[03:46:17.901019] Epoch: [0]  [2790/3229]  lr: 0.000044  grad_norm: 0.7473 (0.8729)  closs: 1.0248 (1.1028)  time: 4.0389  data: 0.0002  max mem: 54684
+[03:46:58.454326] Epoch: [0]  [2800/3229]  lr: 0.000044  grad_norm: 0.7700 (0.8725)  closs: 1.0784 (1.1027)  time: 4.0598  data: 0.0002  max mem: 54684
+[03:47:39.337936] Epoch: [0]  [2810/3229]  lr: 0.000044  grad_norm: 0.7546 (0.8720)  closs: 1.0590 (1.1026)  time: 4.0718  data: 0.0002  max mem: 54684
+[03:48:20.119515] Epoch: [0]  [2820/3229]  lr: 0.000044  grad_norm: 0.7763 (0.8717)  closs: 1.0590 (1.1024)  time: 4.0832  data: 0.0002  max mem: 54684
+[03:49:01.136368] Epoch: [0]  [2830/3229]  lr: 0.000044  grad_norm: 0.7699 (0.8713)  closs: 1.0643 (1.1023)  time: 4.0899  data: 0.0002  max mem: 54684
+[03:49:41.768883] Epoch: [0]  [2840/3229]  lr: 0.000044  grad_norm: 0.7428 (0.8708)  closs: 1.0609 (1.1022)  time: 4.0824  data: 0.0002  max mem: 54684
+[03:50:22.978721] Epoch: [0]  [2850/3229]  lr: 0.000044  grad_norm: 0.7529 (0.8704)  closs: 1.0452 (1.1021)  time: 4.0921  data: 0.0002  max mem: 54684
+[03:51:03.103441] Epoch: [0]  [2860/3229]  lr: 0.000044  grad_norm: 0.7534 (0.8699)  closs: 1.0985 (1.1020)  time: 4.0667  data: 0.0002  max mem: 54684
+[03:51:44.422934] Epoch: [0]  [2870/3229]  lr: 0.000044  grad_norm: 0.7563 (0.8695)  closs: 1.0943 (1.1019)  time: 4.0721  data: 0.0002  max mem: 54684
+[03:52:25.402059] Epoch: [0]  [2880/3229]  lr: 0.000044  grad_norm: 0.7777 (0.8691)  closs: 1.1125 (1.1020)  time: 4.1149  data: 0.0002  max mem: 54684
+[03:53:06.008254] Epoch: [0]  [2890/3229]  lr: 0.000043  grad_norm: 0.7777 (0.8688)  closs: 1.1062 (1.1018)  time: 4.0792  data: 0.0002  max mem: 54684
+[03:53:47.142690] Epoch: [0]  [2900/3229]  lr: 0.000043  grad_norm: 0.7781 (0.8685)  closs: 1.0867 (1.1017)  time: 4.0870  data: 0.0002  max mem: 54684
+[03:54:28.114121] Epoch: [0]  [2910/3229]  lr: 0.000043  grad_norm: 0.7930 (0.8682)  closs: 1.0739 (1.1015)  time: 4.1052  data: 0.0002  max mem: 54684
+[03:55:09.721076] Epoch: [0]  [2920/3229]  lr: 0.000043  grad_norm: 0.7677 (0.8679)  closs: 1.0869 (1.1015)  time: 4.1289  data: 0.0002  max mem: 54684
+[03:55:50.967003] Epoch: [0]  [2930/3229]  lr: 0.000043  grad_norm: 0.8109 (0.8678)  closs: 1.0987 (1.1014)  time: 4.1426  data: 0.0002  max mem: 54684
+[03:56:31.795882] Epoch: [0]  [2940/3229]  lr: 0.000043  grad_norm: 0.8109 (0.8675)  closs: 1.0884 (1.1015)  time: 4.1037  data: 0.0002  max mem: 54684
+[03:57:13.073814] Epoch: [0]  [2950/3229]  lr: 0.000043  grad_norm: 0.7720 (0.8672)  closs: 1.0852 (1.1014)  time: 4.1053  data: 0.0002  max mem: 54684
+[03:57:54.032832] Epoch: [0]  [2960/3229]  lr: 0.000043  grad_norm: 0.7005 (0.8666)  closs: 1.0610 (1.1012)  time: 4.1118  data: 0.0002  max mem: 54684
+[03:58:34.931320] Epoch: [0]  [2970/3229]  lr: 0.000043  grad_norm: 0.7079 (0.8663)  closs: 1.0292 (1.1009)  time: 4.0928  data: 0.0002  max mem: 54684
+[03:59:16.050440] Epoch: [0]  [2980/3229]  lr: 0.000043  grad_norm: 0.7676 (0.8661)  closs: 1.0698 (1.1009)  time: 4.1008  data: 0.0002  max mem: 54684
+[03:59:57.343357] Epoch: [0]  [2990/3229]  lr: 0.000043  grad_norm: 0.7941 (0.8659)  closs:
1.0382 (1.1006) time: 4.1205 data: 0.0002 max mem: 54684 +[04:00:38.675134] Epoch: [0] [3000/3229] lr: 0.000043 grad_norm: 0.8105 (0.8657) closs: 1.0089 (1.1003) time: 4.1312 data: 0.0002 max mem: 54684 +[04:01:19.570077] Epoch: [0] [3010/3229] lr: 0.000043 grad_norm: 0.7900 (0.8653) closs: 1.0515 (1.1003) time: 4.1113 data: 0.0002 max mem: 54684 +[04:02:00.671184] Epoch: [0] [3020/3229] lr: 0.000043 grad_norm: 0.7517 (0.8650) closs: 1.0734 (1.1003) time: 4.0997 data: 0.0002 max mem: 54684 +[04:02:40.993797] Epoch: [0] [3030/3229] lr: 0.000043 grad_norm: 0.7689 (0.8646) closs: 1.0596 (1.1000) time: 4.0711 data: 0.0002 max mem: 54684 +[04:03:22.299784] Epoch: [0] [3040/3229] lr: 0.000043 grad_norm: 0.7714 (0.8643) closs: 1.0596 (1.0998) time: 4.0814 data: 0.0002 max mem: 54684 +[04:04:03.204281] Epoch: [0] [3050/3229] lr: 0.000043 grad_norm: 0.7578 (0.8639) closs: 1.0455 (1.0997) time: 4.1105 data: 0.0002 max mem: 54684 +[04:04:44.303035] Epoch: [0] [3060/3229] lr: 0.000043 grad_norm: 0.7642 (0.8637) closs: 1.0497 (1.0996) time: 4.1001 data: 0.0002 max mem: 54684 +[04:05:25.256193] Epoch: [0] [3070/3229] lr: 0.000042 grad_norm: 0.7642 (0.8633) closs: 1.0778 (1.0996) time: 4.1025 data: 0.0002 max mem: 54684 +[04:06:05.981901] Epoch: [0] [3080/3229] lr: 0.000042 grad_norm: 0.7319 (0.8628) closs: 1.0596 (1.0994) time: 4.0839 data: 0.0002 max mem: 54684 +[04:06:46.434233] Epoch: [0] [3090/3229] lr: 0.000042 grad_norm: 0.7442 (0.8624) closs: 1.0718 (1.0993) time: 4.0588 data: 0.0002 max mem: 54684 +[04:07:27.009193] Epoch: [0] [3100/3229] lr: 0.000042 grad_norm: 0.7568 (0.8619) closs: 1.1194 (1.0992) time: 4.0513 data: 0.0002 max mem: 54684 +[04:08:07.508250] Epoch: [0] [3110/3229] lr: 0.000042 grad_norm: 0.7233 (0.8615) closs: 1.0551 (1.0991) time: 4.0536 data: 0.0002 max mem: 54684 +[04:08:48.595599] Epoch: [0] [3120/3229] lr: 0.000042 grad_norm: 0.7674 (0.8612) closs: 1.0551 (1.0989) time: 4.0793 data: 0.0002 max mem: 54684 +[04:09:28.728428] Epoch: [0] [3130/3229] lr: 0.000042 grad_norm: 0.7612 (0.8607) closs: 1.0629 (1.0987) time: 4.0609 data: 0.0002 max mem: 54684 +[04:10:08.523858] Epoch: [0] [3140/3229] lr: 0.000042 grad_norm: 0.6953 (0.8602) closs: 1.0381 (1.0985) time: 3.9964 data: 0.0002 max mem: 54684 +[04:10:49.845722] Epoch: [0] [3150/3229] lr: 0.000042 grad_norm: 0.7319 (0.8599) closs: 1.0381 (1.0984) time: 4.0558 data: 0.0002 max mem: 54684 +[04:11:30.298486] Epoch: [0] [3160/3229] lr: 0.000042 grad_norm: 0.7591 (0.8595) closs: 1.0370 (1.0981) time: 4.0887 data: 0.0002 max mem: 54684 +[04:12:11.068402] Epoch: [0] [3170/3229] lr: 0.000042 grad_norm: 0.7661 (0.8592) closs: 1.0615 (1.0980) time: 4.0611 data: 0.0002 max mem: 54684 +[04:12:51.939751] Epoch: [0] [3180/3229] lr: 0.000042 grad_norm: 0.8030 (0.8590) closs: 1.0806 (1.0979) time: 4.0820 data: 0.0002 max mem: 54684 +[04:13:32.816037] Epoch: [0] [3190/3229] lr: 0.000042 grad_norm: 0.7835 (0.8587) closs: 1.0738 (1.0978) time: 4.0873 data: 0.0002 max mem: 54684 +[04:14:14.251177] Epoch: [0] [3200/3229] lr: 0.000042 grad_norm: 0.7516 (0.8583) closs: 1.0760 (1.0977) time: 4.1155 data: 0.0003 max mem: 54684 +[04:14:55.041953] Epoch: [0] [3210/3229] lr: 0.000042 grad_norm: 0.7438 (0.8580) closs: 1.0863 (1.0976) time: 4.1112 data: 0.0003 max mem: 54684 +[04:15:36.390692] Epoch: [0] [3220/3229] lr: 0.000042 grad_norm: 0.7658 (0.8579) closs: 1.0767 (1.0975) time: 4.1069 data: 0.0001 max mem: 54684 +[04:16:09.561812] Epoch: [0] Total time: 3:40:03 +[04:16:09.562785] Averaged stats: lr: 0.000042 grad_norm: 0.7787 (0.8577) closs: 1.0483 
(1.0961) +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[04:16:09.927151] model saved +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn( +[04:16:11.631099] optimizer saved +[04:16:11.631728] other rank-common saved +[04:16:11.636736] rank-specific saved +[04:16:11.651089] log_dir: ./output_dir +[04:16:23.993611] Epoch: [1] [0/3229] lr: 0.000042 grad_norm: 0.8670 (0.8670) closs: 1.0075 (1.0075) time: 12.3415 data: 7.9810 max mem: 54684 +[04:17:06.440076] Epoch: [1] [10/3229] lr: 0.000041 grad_norm: 0.7389 (0.8121) closs: 1.0826 (1.0946) time: 4.9807 data: 0.7257 max mem: 54684 +[04:17:46.869458] Epoch: [1] [20/3229] lr: 0.000041 grad_norm: 0.7389 (0.7906) closs: 1.0572 (1.0652) time: 4.1437 data: 0.0002 max mem: 54684 +[04:18:28.213204] Epoch: [1] [30/3229] lr: 0.000041 grad_norm: 0.7856 (0.7980) closs: 1.0665 (1.0783) time: 4.0886 data: 0.0002 max mem: 54684 +[04:19:09.261576] Epoch: [1] [40/3229] lr: 0.000041 grad_norm: 0.7665 (0.7884) closs: 1.0941 (1.0814) time: 4.1195 data: 0.0002 max mem: 54684 +[04:19:50.348250] Epoch: [1] [50/3229] lr: 0.000041 grad_norm: 0.7463 (0.7844) closs: 1.0827 (1.0739) time: 4.1067 data: 0.0002 max mem: 54684 +[04:20:30.454474] Epoch: [1] [60/3229] lr: 0.000041 grad_norm: 0.7801 (0.7795) closs: 1.0234 (1.0633) time: 4.0596 data: 0.0002 max mem: 54684 +[04:21:11.357215] Epoch: [1] [70/3229] lr: 0.000041 grad_norm: 0.8025 (0.7812) closs: 1.0314 (1.0602) time: 4.0504 data: 0.0002 max mem: 54684 +[04:21:51.356188] Epoch: [1] [80/3229] lr: 0.000041 grad_norm: 0.7522 (0.7691) closs: 1.0226 (1.0522) time: 4.0450 data: 0.0002 max mem: 54684 +[04:22:33.043953] Epoch: [1] [90/3229] lr: 0.000041 grad_norm: 0.7018 (0.7681) closs: 1.0252 (1.0538) time: 4.0843 data: 0.0002 max mem: 54684 +[04:23:13.831663] Epoch: [1] [100/3229] lr: 0.000041 grad_norm: 0.7546 (0.7674) closs: 1.0535 (1.0529) time: 4.1237 data: 0.0002 max mem: 54684 +[04:23:55.181411] Epoch: [1] [110/3229] lr: 0.000041 grad_norm: 0.7521 (0.7679) closs: 1.0461 (1.0515) time: 4.1068 data: 0.0002 max mem: 54684 +[04:24:35.870155] Epoch: [1] [120/3229] lr: 0.000041 grad_norm: 0.7521 (0.7650) closs: 1.0400 (1.0525) time: 4.1019 data: 0.0002 max mem: 54684 +[04:25:17.325692] Epoch: [1] [130/3229] lr: 0.000041 grad_norm: 0.7635 (0.7660) closs: 1.0768 (1.0555) time: 4.1071 data: 0.0002 max mem: 54684 +[04:25:58.111171] Epoch: [1] [140/3229] lr: 0.000041 grad_norm: 0.7635 (0.7666) closs: 1.0801 (1.0569) time: 4.1120 data: 0.0002 max mem: 54684 +[04:26:38.709088] Epoch: [1] [150/3229] lr: 0.000041 grad_norm: 0.7391 (0.7641) closs: 1.0635 (1.0556) time: 4.0691 data: 0.0002 max mem: 54684 +[04:27:19.437075] Epoch: [1] [160/3229] lr: 0.000041 grad_norm: 0.7284 (0.7647) closs: 1.0512 (1.0563) time: 4.0662 data: 0.0002 max mem: 54684 +[04:28:00.075659] Epoch: [1] [170/3229] lr: 0.000040 grad_norm: 0.7635 (0.7653) closs: 1.0589 (1.0533) time: 4.0683 data: 0.0002 max mem: 54684 +[04:28:40.520775] Epoch: [1] [180/3229] lr: 0.000040 grad_norm: 0.7911 (0.7696) closs: 1.0460 (1.0532) time: 4.0541 data: 0.0002 max mem: 54684 +[04:29:21.220750] Epoch: [1] [190/3229] lr: 0.000040 grad_norm: 0.7859 (0.7694) closs: 1.0297 (1.0513) time: 4.0572 data: 0.0002 max mem: 54684 +[04:30:02.479262] Epoch: [1] [200/3229] lr: 0.000040 grad_norm: 0.7721 (0.7706) closs: 1.0362 (1.0522) time: 4.0979 data: 0.0002 max mem: 54684 +[04:30:43.921995] Epoch: [1] [210/3229] lr: 0.000040 grad_norm: 0.8007 (0.7734) closs: 1.0904 (1.0546) time: 4.1350 data: 0.0002 max mem: 54684 +[04:31:24.392183] Epoch: [1] [220/3229] lr: 0.000040 grad_norm: 0.8126 (0.7731) closs: 1.0870 (1.0548) time: 4.0956 data: 0.0002 max mem: 54684 +[04:32:04.710459] Epoch: [1] [230/3229] lr: 0.000040 
grad_norm: 0.7601 (0.7722) closs: 1.0787 (1.0528) time: 4.0394 data: 0.0002 max mem: 54684 +[04:32:45.662972] Epoch: [1] [240/3229] lr: 0.000040 grad_norm: 0.7548 (0.7715) closs: 1.0161 (1.0511) time: 4.0635 data: 0.0002 max mem: 54684 +[04:33:27.043779] Epoch: [1] [250/3229] lr: 0.000040 grad_norm: 0.7622 (0.7718) closs: 1.0228 (1.0510) time: 4.1166 data: 0.0002 max mem: 54684 +[04:34:08.159916] Epoch: [1] [260/3229] lr: 0.000040 grad_norm: 0.7596 (0.7716) closs: 1.0571 (1.0517) time: 4.1248 data: 0.0002 max mem: 54684 +[04:34:49.118869] Epoch: [1] [270/3229] lr: 0.000040 grad_norm: 0.7411 (0.7711) closs: 1.0854 (1.0523) time: 4.1037 data: 0.0002 max mem: 54684 +[04:35:29.743500] Epoch: [1] [280/3229] lr: 0.000040 grad_norm: 0.7411 (0.7708) closs: 1.0320 (1.0520) time: 4.0791 data: 0.0002 max mem: 54684 +[04:36:10.896285] Epoch: [1] [290/3229] lr: 0.000040 grad_norm: 0.7698 (0.7711) closs: 1.0320 (1.0519) time: 4.0888 data: 0.0002 max mem: 54684 +[04:36:50.693561] Epoch: [1] [300/3229] lr: 0.000040 grad_norm: 0.7667 (0.7693) closs: 1.0176 (1.0507) time: 4.0474 data: 0.0002 max mem: 54684 +[04:37:31.371110] Epoch: [1] [310/3229] lr: 0.000040 grad_norm: 0.7233 (0.7689) closs: 1.0032 (1.0496) time: 4.0237 data: 0.0002 max mem: 54684 +[04:38:12.621107] Epoch: [1] [320/3229] lr: 0.000039 grad_norm: 0.7637 (0.7696) closs: 1.0102 (1.0494) time: 4.0963 data: 0.0002 max mem: 54684 +[04:38:53.280696] Epoch: [1] [330/3229] lr: 0.000039 grad_norm: 0.7538 (0.7692) closs: 1.0404 (1.0488) time: 4.0954 data: 0.0002 max mem: 54684 +[04:39:34.384153] Epoch: [1] [340/3229] lr: 0.000039 grad_norm: 0.7223 (0.7678) closs: 1.0712 (1.0507) time: 4.0881 data: 0.0002 max mem: 54684 +[04:40:14.745485] Epoch: [1] [350/3229] lr: 0.000039 grad_norm: 0.7186 (0.7656) closs: 1.0880 (1.0494) time: 4.0732 data: 0.0002 max mem: 54684 +[04:40:55.725764] Epoch: [1] [360/3229] lr: 0.000039 grad_norm: 0.7281 (0.7657) closs: 1.0522 (1.0491) time: 4.0670 data: 0.0002 max mem: 54684 +[04:41:37.083851] Epoch: [1] [370/3229] lr: 0.000039 grad_norm: 0.7441 (0.7654) closs: 1.0619 (1.0497) time: 4.1169 data: 0.0002 max mem: 54684 +[04:42:18.173848] Epoch: [1] [380/3229] lr: 0.000039 grad_norm: 0.7505 (0.7652) closs: 1.0880 (1.0510) time: 4.1223 data: 0.0002 max mem: 54684 +[04:42:58.469739] Epoch: [1] [390/3229] lr: 0.000039 grad_norm: 0.7717 (0.7642) closs: 1.0462 (1.0496) time: 4.0692 data: 0.0002 max mem: 54684 +[04:43:39.241200] Epoch: [1] [400/3229] lr: 0.000039 grad_norm: 0.7749 (0.7649) closs: 0.9971 (1.0484) time: 4.0533 data: 0.0002 max mem: 54684 +[04:44:20.120315] Epoch: [1] [410/3229] lr: 0.000039 grad_norm: 0.8080 (0.7657) closs: 1.0498 (1.0489) time: 4.0825 data: 0.0002 max mem: 54684 +[04:45:00.557566] Epoch: [1] [420/3229] lr: 0.000039 grad_norm: 0.7665 (0.7651) closs: 1.0759 (1.0488) time: 4.0658 data: 0.0002 max mem: 54684 +[04:45:41.828412] Epoch: [1] [430/3229] lr: 0.000039 grad_norm: 0.7265 (0.7650) closs: 1.0759 (1.0498) time: 4.0853 data: 0.0002 max mem: 54684 +[04:46:22.509399] Epoch: [1] [440/3229] lr: 0.000039 grad_norm: 0.7462 (0.7646) closs: 1.0742 (1.0497) time: 4.0975 data: 0.0002 max mem: 54684 +[04:47:03.187153] Epoch: [1] [450/3229] lr: 0.000039 grad_norm: 0.7618 (0.7645) closs: 1.0617 (1.0497) time: 4.0679 data: 0.0002 max mem: 54684 +[04:47:44.104151] Epoch: [1] [460/3229] lr: 0.000039 grad_norm: 0.7541 (0.7639) closs: 1.0582 (1.0501) time: 4.0797 data: 0.0002 max mem: 54684 +[04:48:24.641461] Epoch: [1] [470/3229] lr: 0.000038 grad_norm: 0.7528 (0.7635) closs: 1.0582 (1.0494) time: 4.0727 data: 
0.0002 max mem: 54684 +[04:49:05.751178] Epoch: [1] [480/3229] lr: 0.000038 grad_norm: 0.7481 (0.7628) closs: 1.0947 (1.0506) time: 4.0823 data: 0.0002 max mem: 54684 +[04:49:46.537424] Epoch: [1] [490/3229] lr: 0.000038 grad_norm: 0.7552 (0.7629) closs: 1.0837 (1.0503) time: 4.0947 data: 0.0002 max mem: 54684 +[04:50:27.314255] Epoch: [1] [500/3229] lr: 0.000038 grad_norm: 0.7712 (0.7634) closs: 1.0294 (1.0500) time: 4.0781 data: 0.0002 max mem: 54684 +[04:51:08.310823] Epoch: [1] [510/3229] lr: 0.000038 grad_norm: 0.8222 (0.7643) closs: 1.0444 (1.0500) time: 4.0886 data: 0.0002 max mem: 54684 +[04:51:49.708115] Epoch: [1] [520/3229] lr: 0.000038 grad_norm: 0.7931 (0.7644) closs: 1.0786 (1.0511) time: 4.1196 data: 0.0002 max mem: 54684 +[04:52:30.929265] Epoch: [1] [530/3229] lr: 0.000038 grad_norm: 0.7668 (0.7648) closs: 1.0823 (1.0516) time: 4.1309 data: 0.0002 max mem: 54684 +[04:53:11.799214] Epoch: [1] [540/3229] lr: 0.000038 grad_norm: 0.7484 (0.7644) closs: 1.0851 (1.0518) time: 4.1045 data: 0.0002 max mem: 54684 +[04:53:52.617648] Epoch: [1] [550/3229] lr: 0.000038 grad_norm: 0.7576 (0.7642) closs: 1.0762 (1.0517) time: 4.0844 data: 0.0002 max mem: 54684 +[04:54:33.511754] Epoch: [1] [560/3229] lr: 0.000038 grad_norm: 0.7606 (0.7637) closs: 1.0481 (1.0516) time: 4.0856 data: 0.0002 max mem: 54684 +[04:55:14.276726] Epoch: [1] [570/3229] lr: 0.000038 grad_norm: 0.7616 (0.7649) closs: 1.0444 (1.0516) time: 4.0829 data: 0.0002 max mem: 54684 +[04:55:54.841137] Epoch: [1] [580/3229] lr: 0.000038 grad_norm: 0.7616 (0.7646) closs: 1.0423 (1.0511) time: 4.0664 data: 0.0002 max mem: 54684 +[04:56:35.646680] Epoch: [1] [590/3229] lr: 0.000038 grad_norm: 0.7542 (0.7649) closs: 1.0440 (1.0512) time: 4.0684 data: 0.0002 max mem: 54684 +[04:57:16.334209] Epoch: [1] [600/3229] lr: 0.000038 grad_norm: 0.7437 (0.7638) closs: 1.0745 (1.0512) time: 4.0746 data: 0.0002 max mem: 54684 +[04:57:57.448181] Epoch: [1] [610/3229] lr: 0.000038 grad_norm: 0.7439 (0.7642) closs: 1.0596 (1.0512) time: 4.0900 data: 0.0002 max mem: 54684 +[04:58:38.714028] Epoch: [1] [620/3229] lr: 0.000037 grad_norm: 0.7602 (0.7641) closs: 1.0594 (1.0513) time: 4.1189 data: 0.0002 max mem: 54684 +[04:59:19.266620] Epoch: [1] [630/3229] lr: 0.000037 grad_norm: 0.7325 (0.7639) closs: 1.0401 (1.0502) time: 4.0909 data: 0.0002 max mem: 54684 +[05:00:00.368157] Epoch: [1] [640/3229] lr: 0.000037 grad_norm: 0.7513 (0.7638) closs: 1.0248 (1.0503) time: 4.0826 data: 0.0002 max mem: 54684 +[05:00:41.160239] Epoch: [1] [650/3229] lr: 0.000037 grad_norm: 0.7513 (0.7635) closs: 1.0358 (1.0502) time: 4.0946 data: 0.0002 max mem: 54684 +[05:01:22.138831] Epoch: [1] [660/3229] lr: 0.000037 grad_norm: 0.7456 (0.7637) closs: 1.0590 (1.0504) time: 4.0885 data: 0.0002 max mem: 54684 +[05:02:02.946232] Epoch: [1] [670/3229] lr: 0.000037 grad_norm: 0.7457 (0.7634) closs: 1.0762 (1.0507) time: 4.0892 data: 0.0002 max mem: 54684 +[05:02:44.502278] Epoch: [1] [680/3229] lr: 0.000037 grad_norm: 0.7440 (0.7634) closs: 1.0678 (1.0509) time: 4.1181 data: 0.0002 max mem: 54684 +[05:03:24.951117] Epoch: [1] [690/3229] lr: 0.000037 grad_norm: 0.7456 (0.7630) closs: 1.0585 (1.0513) time: 4.1002 data: 0.0002 max mem: 54684 +[05:04:05.578068] Epoch: [1] [700/3229] lr: 0.000037 grad_norm: 0.7712 (0.7629) closs: 1.0648 (1.0509) time: 4.0537 data: 0.0002 max mem: 54684 +[05:04:46.382125] Epoch: [1] [710/3229] lr: 0.000037 grad_norm: 0.7536 (0.7623) closs: 1.0831 (1.0508) time: 4.0715 data: 0.0002 max mem: 54684 +[05:05:27.836191] Epoch: [1] [720/3229] lr: 
0.000037 grad_norm: 0.7533 (0.7623) closs: 1.0761 (1.0516) time: 4.1128 data: 0.0002 max mem: 54684 +[05:06:08.635002] Epoch: [1] [730/3229] lr: 0.000037 grad_norm: 0.7805 (0.7629) closs: 1.0763 (1.0519) time: 4.1126 data: 0.0002 max mem: 54684 +[05:06:48.634490] Epoch: [1] [740/3229] lr: 0.000037 grad_norm: 0.7186 (0.7620) closs: 1.0197 (1.0513) time: 4.0398 data: 0.0002 max mem: 54684 +[05:07:29.445391] Epoch: [1] [750/3229] lr: 0.000037 grad_norm: 0.7025 (0.7616) closs: 1.0359 (1.0516) time: 4.0405 data: 0.0002 max mem: 54684 +[05:08:11.035741] Epoch: [1] [760/3229] lr: 0.000036 grad_norm: 0.7647 (0.7623) closs: 1.0791 (1.0519) time: 4.1200 data: 0.0002 max mem: 54684 +[05:08:51.808203] Epoch: [1] [770/3229] lr: 0.000036 grad_norm: 0.8157 (0.7630) closs: 1.0874 (1.0524) time: 4.1181 data: 0.0002 max mem: 54684 +[05:09:32.821381] Epoch: [1] [780/3229] lr: 0.000036 grad_norm: 0.7640 (0.7628) closs: 1.0954 (1.0530) time: 4.0892 data: 0.0002 max mem: 54684 +[05:10:13.634195] Epoch: [1] [790/3229] lr: 0.000036 grad_norm: 0.7289 (0.7624) closs: 1.0748 (1.0527) time: 4.0912 data: 0.0002 max mem: 54684 +[05:10:55.053924] Epoch: [1] [800/3229] lr: 0.000036 grad_norm: 0.7343 (0.7624) closs: 1.0321 (1.0528) time: 4.1116 data: 0.0002 max mem: 54684 +[05:11:35.196793] Epoch: [1] [810/3229] lr: 0.000036 grad_norm: 0.7367 (0.7620) closs: 1.0322 (1.0524) time: 4.0781 data: 0.0002 max mem: 54684 +[05:12:16.522962] Epoch: [1] [820/3229] lr: 0.000036 grad_norm: 0.7401 (0.7621) closs: 1.0246 (1.0522) time: 4.0734 data: 0.0002 max mem: 54684 +[05:12:57.137015] Epoch: [1] [830/3229] lr: 0.000036 grad_norm: 0.7595 (0.7617) closs: 1.0450 (1.0524) time: 4.0969 data: 0.0002 max mem: 54684 +[05:13:38.237630] Epoch: [1] [840/3229] lr: 0.000036 grad_norm: 0.7652 (0.7616) closs: 1.0833 (1.0530) time: 4.0857 data: 0.0002 max mem: 54684 +[05:14:19.021316] Epoch: [1] [850/3229] lr: 0.000036 grad_norm: 0.7652 (0.7614) closs: 1.0680 (1.0532) time: 4.0941 data: 0.0002 max mem: 54684 +[05:14:59.987219] Epoch: [1] [860/3229] lr: 0.000036 grad_norm: 0.7362 (0.7610) closs: 1.0479 (1.0534) time: 4.0874 data: 0.0002 max mem: 54684 +[05:15:40.669534] Epoch: [1] [870/3229] lr: 0.000036 grad_norm: 0.7348 (0.7608) closs: 1.0472 (1.0530) time: 4.0823 data: 0.0002 max mem: 54684 +[05:16:21.345203] Epoch: [1] [880/3229] lr: 0.000036 grad_norm: 0.7769 (0.7607) closs: 1.0219 (1.0529) time: 4.0678 data: 0.0002 max mem: 54684 +[05:17:02.174965] Epoch: [1] [890/3229] lr: 0.000036 grad_norm: 0.7393 (0.7603) closs: 1.0763 (1.0534) time: 4.0752 data: 0.0002 max mem: 54684 +[05:17:43.153839] Epoch: [1] [900/3229] lr: 0.000035 grad_norm: 0.7229 (0.7600) closs: 1.0763 (1.0535) time: 4.0904 data: 0.0002 max mem: 54684 +[05:18:23.532375] Epoch: [1] [910/3229] lr: 0.000035 grad_norm: 0.7346 (0.7597) closs: 1.0722 (1.0536) time: 4.0678 data: 0.0002 max mem: 54684 +[05:19:04.844983] Epoch: [1] [920/3229] lr: 0.000035 grad_norm: 0.7679 (0.7600) closs: 1.0613 (1.0536) time: 4.0845 data: 0.0002 max mem: 54684 +[05:19:45.960496] Epoch: [1] [930/3229] lr: 0.000035 grad_norm: 0.7816 (0.7600) closs: 1.0613 (1.0537) time: 4.1213 data: 0.0002 max mem: 54684 +[05:20:26.646864] Epoch: [1] [940/3229] lr: 0.000035 grad_norm: 0.7816 (0.7601) closs: 1.0160 (1.0532) time: 4.0900 data: 0.0002 max mem: 54684 +[05:21:07.563052] Epoch: [1] [950/3229] lr: 0.000035 grad_norm: 0.7621 (0.7605) closs: 0.9814 (1.0528) time: 4.0801 data: 0.0002 max mem: 54684 +[05:21:48.451416] Epoch: [1] [960/3229] lr: 0.000035 grad_norm: 0.7508 (0.7605) closs: 0.9968 (1.0529) time: 4.0902 
data: 0.0002 max mem: 54684 +[05:22:29.552875] Epoch: [1] [970/3229] lr: 0.000035 grad_norm: 0.7519 (0.7605) closs: 1.0570 (1.0530) time: 4.0994 data: 0.0002 max mem: 54684 +[05:23:10.530952] Epoch: [1] [980/3229] lr: 0.000035 grad_norm: 0.7625 (0.7606) closs: 1.0570 (1.0532) time: 4.1039 data: 0.0002 max mem: 54684 +[05:23:51.878010] Epoch: [1] [990/3229] lr: 0.000035 grad_norm: 0.7625 (0.7608) closs: 1.0764 (1.0534) time: 4.1162 data: 0.0002 max mem: 54684 +[05:24:31.849857] Epoch: [1] [1000/3229] lr: 0.000035 grad_norm: 0.7446 (0.7599) closs: 1.0764 (1.0532) time: 4.0659 data: 0.0002 max mem: 54684 +[05:25:12.639207] Epoch: [1] [1010/3229] lr: 0.000035 grad_norm: 0.7020 (0.7597) closs: 1.0242 (1.0529) time: 4.0380 data: 0.0002 max mem: 54684 +[05:25:52.977496] Epoch: [1] [1020/3229] lr: 0.000035 grad_norm: 0.7052 (0.7594) closs: 1.0555 (1.0527) time: 4.0563 data: 0.0002 max mem: 54684 +[05:26:34.003363] Epoch: [1] [1030/3229] lr: 0.000034 grad_norm: 0.7147 (0.7588) closs: 1.0503 (1.0527) time: 4.0681 data: 0.0002 max mem: 54684 +[05:27:14.573525] Epoch: [1] [1040/3229] lr: 0.000034 grad_norm: 0.7147 (0.7586) closs: 1.0450 (1.0524) time: 4.0797 data: 0.0002 max mem: 54684 +[05:27:55.498440] Epoch: [1] [1050/3229] lr: 0.000034 grad_norm: 0.7522 (0.7589) closs: 1.0486 (1.0523) time: 4.0747 data: 0.0002 max mem: 54684 +[05:28:36.363003] Epoch: [1] [1060/3229] lr: 0.000034 grad_norm: 0.7522 (0.7588) closs: 1.0553 (1.0523) time: 4.0894 data: 0.0002 max mem: 54684 +[05:29:17.566434] Epoch: [1] [1070/3229] lr: 0.000034 grad_norm: 0.7544 (0.7588) closs: 1.0536 (1.0523) time: 4.1033 data: 0.0002 max mem: 54684 +[05:29:57.687127] Epoch: [1] [1080/3229] lr: 0.000034 grad_norm: 0.7544 (0.7586) closs: 1.0454 (1.0519) time: 4.0661 data: 0.0002 max mem: 54684 +[05:30:38.261585] Epoch: [1] [1090/3229] lr: 0.000034 grad_norm: 0.7317 (0.7584) closs: 1.0487 (1.0519) time: 4.0347 data: 0.0002 max mem: 54684 +[05:31:19.146600] Epoch: [1] [1100/3229] lr: 0.000034 grad_norm: 0.7461 (0.7585) closs: 1.0237 (1.0518) time: 4.0729 data: 0.0002 max mem: 54684 +[05:32:00.230890] Epoch: [1] [1110/3229] lr: 0.000034 grad_norm: 0.7654 (0.7586) closs: 1.0237 (1.0518) time: 4.0984 data: 0.0002 max mem: 54684 +[05:32:40.323167] Epoch: [1] [1120/3229] lr: 0.000034 grad_norm: 0.7725 (0.7589) closs: 1.0261 (1.0517) time: 4.0588 data: 0.0002 max mem: 54684 +[05:33:21.572530] Epoch: [1] [1130/3229] lr: 0.000034 grad_norm: 0.7690 (0.7592) closs: 1.0308 (1.0517) time: 4.0670 data: 0.0002 max mem: 54684 +[05:34:02.762302] Epoch: [1] [1140/3229] lr: 0.000034 grad_norm: 0.7708 (0.7595) closs: 1.0746 (1.0522) time: 4.1219 data: 0.0002 max mem: 54684 +[05:34:43.703986] Epoch: [1] [1150/3229] lr: 0.000034 grad_norm: 0.7656 (0.7593) closs: 1.0766 (1.0521) time: 4.1065 data: 0.0003 max mem: 54684 +[05:35:24.484939] Epoch: [1] [1160/3229] lr: 0.000034 grad_norm: 0.7151 (0.7591) closs: 1.0380 (1.0517) time: 4.0861 data: 0.0003 max mem: 54684 +[05:36:05.401846] Epoch: [1] [1170/3229] lr: 0.000033 grad_norm: 0.7522 (0.7591) closs: 1.0499 (1.0519) time: 4.0848 data: 0.0002 max mem: 54684 +[05:36:45.957264] Epoch: [1] [1180/3229] lr: 0.000033 grad_norm: 0.7522 (0.7588) closs: 1.0587 (1.0517) time: 4.0735 data: 0.0002 max mem: 54684 +[05:37:27.507925] Epoch: [1] [1190/3229] lr: 0.000033 grad_norm: 0.7522 (0.7589) closs: 1.0445 (1.0518) time: 4.1052 data: 0.0002 max mem: 54684 +[05:38:08.323255] Epoch: [1] [1200/3229] lr: 0.000033 grad_norm: 0.8299 (0.7594) closs: 1.0666 (1.0517) time: 4.1182 data: 0.0002 max mem: 54684 +[05:38:49.657429] Epoch: 
[1] [1210/3229] lr: 0.000033 grad_norm: 0.8322 (0.7596) closs: 1.0805 (1.0520) time: 4.1074 data: 0.0002 max mem: 54684 +[05:39:30.796365] Epoch: [1] [1220/3229] lr: 0.000033 grad_norm: 0.7634 (0.7597) closs: 1.0942 (1.0524) time: 4.1236 data: 0.0002 max mem: 54684 +[05:40:12.472149] Epoch: [1] [1230/3229] lr: 0.000033 grad_norm: 0.7564 (0.7597) closs: 1.0754 (1.0523) time: 4.1407 data: 0.0002 max mem: 54684 +[05:40:53.596788] Epoch: [1] [1240/3229] lr: 0.000033 grad_norm: 0.7499 (0.7610) closs: 1.0346 (1.0525) time: 4.1400 data: 0.0002 max mem: 54684 +[05:41:34.343746] Epoch: [1] [1250/3229] lr: 0.000033 grad_norm: 0.7309 (0.7605) closs: 1.0346 (1.0524) time: 4.0935 data: 0.0002 max mem: 54684 +[05:42:14.850396] Epoch: [1] [1260/3229] lr: 0.000033 grad_norm: 0.7357 (0.7604) closs: 1.0046 (1.0521) time: 4.0626 data: 0.0002 max mem: 54684 +[05:42:55.902233] Epoch: [1] [1270/3229] lr: 0.000033 grad_norm: 0.7459 (0.7603) closs: 0.9939 (1.0517) time: 4.0779 data: 0.0002 max mem: 54684 +[05:43:36.700285] Epoch: [1] [1280/3229] lr: 0.000033 grad_norm: 0.7424 (0.7602) closs: 1.0316 (1.0518) time: 4.0924 data: 0.0002 max mem: 54684 +[05:44:18.029631] Epoch: [1] [1290/3229] lr: 0.000033 grad_norm: 0.7584 (0.7604) closs: 1.0437 (1.0518) time: 4.1063 data: 0.0002 max mem: 54684 +[05:44:58.460022] Epoch: [1] [1300/3229] lr: 0.000032 grad_norm: 0.7471 (0.7602) closs: 1.0371 (1.0516) time: 4.0879 data: 0.0002 max mem: 54684 +[05:45:39.434680] Epoch: [1] [1310/3229] lr: 0.000032 grad_norm: 0.7390 (0.7601) closs: 1.0832 (1.0518) time: 4.0702 data: 0.0002 max mem: 54684 +[05:46:19.533138] Epoch: [1] [1320/3229] lr: 0.000032 grad_norm: 0.7437 (0.7601) closs: 1.0843 (1.0517) time: 4.0536 data: 0.0002 max mem: 54684 +[05:46:59.861164] Epoch: [1] [1330/3229] lr: 0.000032 grad_norm: 0.7416 (0.7597) closs: 1.0420 (1.0514) time: 4.0213 data: 0.0002 max mem: 54684 +[05:47:39.762335] Epoch: [1] [1340/3229] lr: 0.000032 grad_norm: 0.7236 (0.7592) closs: 1.0353 (1.0512) time: 4.0114 data: 0.0002 max mem: 54684 +[05:48:21.193611] Epoch: [1] [1350/3229] lr: 0.000032 grad_norm: 0.7413 (0.7594) closs: 1.0614 (1.0513) time: 4.0666 data: 0.0002 max mem: 54684 +[05:49:01.979501] Epoch: [1] [1360/3229] lr: 0.000032 grad_norm: 0.7393 (0.7591) closs: 1.0725 (1.0512) time: 4.1108 data: 0.0002 max mem: 54684 +[05:49:42.964934] Epoch: [1] [1370/3229] lr: 0.000032 grad_norm: 0.7161 (0.7588) closs: 1.0531 (1.0512) time: 4.0885 data: 0.0002 max mem: 54684 +[05:50:23.751298] Epoch: [1] [1380/3229] lr: 0.000032 grad_norm: 0.7334 (0.7589) closs: 1.0633 (1.0516) time: 4.0885 data: 0.0002 max mem: 54684 +[05:51:05.208897] Epoch: [1] [1390/3229] lr: 0.000032 grad_norm: 0.7776 (0.7591) closs: 1.0852 (1.0519) time: 4.1121 data: 0.0002 max mem: 54684 +[05:51:46.307285] Epoch: [1] [1400/3229] lr: 0.000032 grad_norm: 0.7727 (0.7591) closs: 1.0727 (1.0521) time: 4.1277 data: 0.0002 max mem: 54684 +[05:52:27.639855] Epoch: [1] [1410/3229] lr: 0.000032 grad_norm: 0.7589 (0.7592) closs: 1.0593 (1.0520) time: 4.1215 data: 0.0002 max mem: 54684 +[05:53:08.510464] Epoch: [1] [1420/3229] lr: 0.000032 grad_norm: 0.7454 (0.7591) closs: 1.0521 (1.0518) time: 4.1101 data: 0.0002 max mem: 54684 +[05:53:49.534800] Epoch: [1] [1430/3229] lr: 0.000031 grad_norm: 0.7239 (0.7589) closs: 1.0510 (1.0515) time: 4.0947 data: 0.0002 max mem: 54684 +[05:54:30.634251] Epoch: [1] [1440/3229] lr: 0.000031 grad_norm: 0.7376 (0.7591) closs: 1.0525 (1.0516) time: 4.1061 data: 0.0002 max mem: 54684 +[05:55:11.319746] Epoch: [1] [1450/3229] lr: 0.000031 grad_norm: 0.7381 
(0.7590) closs: 1.0587 (1.0514) time: 4.0892 data: 0.0002 max mem: 54684 +[05:55:52.498426] Epoch: [1] [1460/3229] lr: 0.000031 grad_norm: 0.7446 (0.7591) closs: 1.0462 (1.0513) time: 4.0931 data: 0.0002 max mem: 54684 +[05:56:33.716084] Epoch: [1] [1470/3229] lr: 0.000031 grad_norm: 0.7441 (0.7589) closs: 1.0490 (1.0514) time: 4.1198 data: 0.0002 max mem: 54684 +[05:57:14.183953] Epoch: [1] [1480/3229] lr: 0.000031 grad_norm: 0.7353 (0.7586) closs: 1.0471 (1.0514) time: 4.0842 data: 0.0002 max mem: 54684 +[05:57:55.151729] Epoch: [1] [1490/3229] lr: 0.000031 grad_norm: 0.7442 (0.7585) closs: 1.0415 (1.0514) time: 4.0717 data: 0.0002 max mem: 54684 +[05:58:36.455544] Epoch: [1] [1500/3229] lr: 0.000031 grad_norm: 0.7452 (0.7585) closs: 1.0665 (1.0515) time: 4.1135 data: 0.0002 max mem: 54684 +[05:59:17.089096] Epoch: [1] [1510/3229] lr: 0.000031 grad_norm: 0.7419 (0.7581) closs: 1.0889 (1.0517) time: 4.0968 data: 0.0002 max mem: 54684 +[05:59:57.877708] Epoch: [1] [1520/3229] lr: 0.000031 grad_norm: 0.7463 (0.7583) closs: 1.0725 (1.0517) time: 4.0710 data: 0.0002 max mem: 54684 +[06:00:39.181341] Epoch: [1] [1530/3229] lr: 0.000031 grad_norm: 0.7860 (0.7584) closs: 1.0767 (1.0519) time: 4.1045 data: 0.0002 max mem: 54684 +[06:01:20.218941] Epoch: [1] [1540/3229] lr: 0.000031 grad_norm: 0.7508 (0.7584) closs: 1.0767 (1.0517) time: 4.1170 data: 0.0002 max mem: 54684 +[06:02:01.186493] Epoch: [1] [1550/3229] lr: 0.000031 grad_norm: 0.7369 (0.7583) closs: 1.0714 (1.0517) time: 4.1002 data: 0.0002 max mem: 54684 +[06:02:42.438164] Epoch: [1] [1560/3229] lr: 0.000030 grad_norm: 0.7650 (0.7584) closs: 1.0850 (1.0520) time: 4.1109 data: 0.0002 max mem: 54684 +[06:03:23.689124] Epoch: [1] [1570/3229] lr: 0.000030 grad_norm: 0.7622 (0.7584) closs: 1.0709 (1.0521) time: 4.1251 data: 0.0002 max mem: 54684 +[06:04:04.719315] Epoch: [1] [1580/3229] lr: 0.000030 grad_norm: 0.7378 (0.7583) closs: 1.0553 (1.0520) time: 4.1140 data: 0.0002 max mem: 54684 +[06:04:45.698136] Epoch: [1] [1590/3229] lr: 0.000030 grad_norm: 0.7576 (0.7584) closs: 1.0282 (1.0517) time: 4.1004 data: 0.0002 max mem: 54684 +[06:05:27.010602] Epoch: [1] [1600/3229] lr: 0.000030 grad_norm: 0.7794 (0.7588) closs: 1.0155 (1.0516) time: 4.1145 data: 0.0002 max mem: 54684 +[06:06:08.217639] Epoch: [1] [1610/3229] lr: 0.000030 grad_norm: 0.7703 (0.7590) closs: 1.0618 (1.0518) time: 4.1259 data: 0.0002 max mem: 54684 +[06:06:49.708388] Epoch: [1] [1620/3229] lr: 0.000030 grad_norm: 0.7704 (0.7592) closs: 1.0795 (1.0519) time: 4.1348 data: 0.0002 max mem: 54684 +[06:07:30.802140] Epoch: [1] [1630/3229] lr: 0.000030 grad_norm: 0.7723 (0.7593) closs: 1.0570 (1.0521) time: 4.1292 data: 0.0002 max mem: 54684 +[06:08:12.042914] Epoch: [1] [1640/3229] lr: 0.000030 grad_norm: 0.7690 (0.7593) closs: 1.0839 (1.0523) time: 4.1167 data: 0.0002 max mem: 54684 +[06:08:52.552185] Epoch: [1] [1650/3229] lr: 0.000030 grad_norm: 0.7620 (0.7592) closs: 1.0666 (1.0522) time: 4.0874 data: 0.0002 max mem: 54684 +[06:09:34.109821] Epoch: [1] [1660/3229] lr: 0.000030 grad_norm: 0.7443 (0.7592) closs: 1.0719 (1.0526) time: 4.1033 data: 0.0002 max mem: 54684 +[06:10:15.226675] Epoch: [1] [1670/3229] lr: 0.000030 grad_norm: 0.7640 (0.7592) closs: 1.0737 (1.0524) time: 4.1337 data: 0.0002 max mem: 54684 +[06:10:55.893166] Epoch: [1] [1680/3229] lr: 0.000030 grad_norm: 0.7557 (0.7591) closs: 1.0457 (1.0524) time: 4.0891 data: 0.0002 max mem: 54684 +[06:11:37.000880] Epoch: [1] [1690/3229] lr: 0.000029 grad_norm: 0.7449 (0.7591) closs: 1.0646 (1.0525) time: 4.0886 data: 
0.0002 max mem: 54684 +[06:12:18.347873] Epoch: [1] [1700/3229] lr: 0.000029 grad_norm: 0.7646 (0.7592) closs: 1.0581 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:12:59.455957] Epoch: [1] [1710/3229] lr: 0.000029 grad_norm: 0.7725 (0.7593) closs: 1.0392 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:13:40.686492] Epoch: [1] [1720/3229] lr: 0.000029 grad_norm: 0.7731 (0.7595) closs: 1.0769 (1.0526) time: 4.1169 data: 0.0002 max mem: 54684 +[06:14:21.266645] Epoch: [1] [1730/3229] lr: 0.000029 grad_norm: 0.7730 (0.7594) closs: 1.0708 (1.0525) time: 4.0905 data: 0.0002 max mem: 54684 +[06:15:02.058571] Epoch: [1] [1740/3229] lr: 0.000029 grad_norm: 0.7642 (0.7593) closs: 1.0508 (1.0524) time: 4.0685 data: 0.0002 max mem: 54684 +[06:15:43.158278] Epoch: [1] [1750/3229] lr: 0.000029 grad_norm: 0.7606 (0.7594) closs: 1.0358 (1.0525) time: 4.0945 data: 0.0002 max mem: 54684 +[06:16:24.131577] Epoch: [1] [1760/3229] lr: 0.000029 grad_norm: 0.7606 (0.7593) closs: 1.0631 (1.0525) time: 4.1036 data: 0.0002 max mem: 54684 +[06:17:04.589751] Epoch: [1] [1770/3229] lr: 0.000029 grad_norm: 0.7370 (0.7591) closs: 1.0766 (1.0524) time: 4.0715 data: 0.0002 max mem: 54684 +[06:17:46.162753] Epoch: [1] [1780/3229] lr: 0.000029 grad_norm: 0.7053 (0.7589) closs: 1.0610 (1.0525) time: 4.1015 data: 0.0002 max mem: 54684 +[06:18:27.288021] Epoch: [1] [1790/3229] lr: 0.000029 grad_norm: 0.7304 (0.7589) closs: 1.0519 (1.0525) time: 4.1348 data: 0.0002 max mem: 54684 +[06:19:08.597875] Epoch: [1] [1800/3229] lr: 0.000029 grad_norm: 0.7718 (0.7593) closs: 1.0343 (1.0524) time: 4.1217 data: 0.0002 max mem: 54684 +[06:19:48.096823] Epoch: [1] [1810/3229] lr: 0.000028 grad_norm: 0.7355 (0.7589) closs: 0.9675 (1.0521) time: 4.0404 data: 0.0002 max mem: 54684 +[06:20:29.329365] Epoch: [1] [1820/3229] lr: 0.000028 grad_norm: 0.7154 (0.7589) closs: 1.0411 (1.0522) time: 4.0365 data: 0.0002 max mem: 54684 +[06:21:10.086608] Epoch: [1] [1830/3229] lr: 0.000028 grad_norm: 0.7412 (0.7589) closs: 1.0599 (1.0522) time: 4.0994 data: 0.0002 max mem: 54684 +[06:21:51.381549] Epoch: [1] [1840/3229] lr: 0.000028 grad_norm: 0.7621 (0.7589) closs: 1.0664 (1.0523) time: 4.1025 data: 0.0002 max mem: 54684 +[06:22:32.162442] Epoch: [1] [1850/3229] lr: 0.000028 grad_norm: 0.7662 (0.7590) closs: 1.0554 (1.0522) time: 4.1037 data: 0.0002 max mem: 54684 +[06:23:13.433902] Epoch: [1] [1860/3229] lr: 0.000028 grad_norm: 0.7852 (0.7590) closs: 1.0432 (1.0521) time: 4.1025 data: 0.0002 max mem: 54684 +[06:23:54.209307] Epoch: [1] [1870/3229] lr: 0.000028 grad_norm: 0.7674 (0.7591) closs: 1.0409 (1.0521) time: 4.1023 data: 0.0002 max mem: 54684 +[06:24:35.546365] Epoch: [1] [1880/3229] lr: 0.000028 grad_norm: 0.7487 (0.7591) closs: 1.0409 (1.0522) time: 4.1056 data: 0.0002 max mem: 54684 +[06:25:15.982687] Epoch: [1] [1890/3229] lr: 0.000028 grad_norm: 0.7683 (0.7591) closs: 1.0359 (1.0521) time: 4.0886 data: 0.0002 max mem: 54684 +[06:25:57.440510] Epoch: [1] [1900/3229] lr: 0.000028 grad_norm: 0.7670 (0.7592) closs: 1.0557 (1.0522) time: 4.0946 data: 0.0002 max mem: 54684 +[06:26:38.556623] Epoch: [1] [1910/3229] lr: 0.000028 grad_norm: 0.7661 (0.7592) closs: 1.0564 (1.0521) time: 4.1286 data: 0.0002 max mem: 54684 +[06:27:19.228566] Epoch: [1] [1920/3229] lr: 0.000028 grad_norm: 0.7542 (0.7590) closs: 1.0294 (1.0521) time: 4.0893 data: 0.0002 max mem: 54684 +[06:27:59.439243] Epoch: [1] [1930/3229] lr: 0.000028 grad_norm: 0.7197 (0.7587) closs: 1.0121 (1.0517) time: 4.0441 data: 0.0002 max mem: 54684 +[06:28:40.298280] Epoch: 
[1] [1940/3229] lr: 0.000027 grad_norm: 0.7343 (0.7586) closs: 1.0121 (1.0518) time: 4.0534 data: 0.0002 max mem: 54684 +[06:29:21.418176] Epoch: [1] [1950/3229] lr: 0.000027 grad_norm: 0.7666 (0.7588) closs: 1.0867 (1.0519) time: 4.0989 data: 0.0002 max mem: 54684 +[06:30:01.756178] Epoch: [1] [1960/3229] lr: 0.000027 grad_norm: 0.7619 (0.7587) closs: 1.0618 (1.0519) time: 4.0728 data: 0.0002 max mem: 54684 +[06:30:43.062087] Epoch: [1] [1970/3229] lr: 0.000027 grad_norm: 0.7588 (0.7588) closs: 1.0484 (1.0518) time: 4.0821 data: 0.0002 max mem: 54684 +[06:31:23.143763] Epoch: [1] [1980/3229] lr: 0.000027 grad_norm: 0.7496 (0.7584) closs: 1.0332 (1.0516) time: 4.0693 data: 0.0002 max mem: 54684 +[06:32:04.271038] Epoch: [1] [1990/3229] lr: 0.000027 grad_norm: 0.7496 (0.7585) closs: 1.0258 (1.0515) time: 4.0604 data: 0.0002 max mem: 54684 +[06:32:45.254334] Epoch: [1] [2000/3229] lr: 0.000027 grad_norm: 0.7825 (0.7587) closs: 1.0567 (1.0515) time: 4.1055 data: 0.0002 max mem: 54684 +[06:33:26.464223] Epoch: [1] [2010/3229] lr: 0.000027 grad_norm: 0.7723 (0.7588) closs: 1.0622 (1.0516) time: 4.1096 data: 0.0002 max mem: 54684 +[06:34:07.977608] Epoch: [1] [2020/3229] lr: 0.000027 grad_norm: 0.7569 (0.7588) closs: 1.0776 (1.0517) time: 4.1361 data: 0.0002 max mem: 54684 +[06:34:48.895957] Epoch: [1] [2030/3229] lr: 0.000027 grad_norm: 0.7560 (0.7586) closs: 1.0790 (1.0517) time: 4.1215 data: 0.0002 max mem: 54684 +[06:35:29.451828] Epoch: [1] [2040/3229] lr: 0.000027 grad_norm: 0.7256 (0.7584) closs: 1.0240 (1.0514) time: 4.0736 data: 0.0002 max mem: 54684 +[06:36:10.091914] Epoch: [1] [2050/3229] lr: 0.000027 grad_norm: 0.7247 (0.7581) closs: 1.0561 (1.0513) time: 4.0597 data: 0.0002 max mem: 54684 +[06:36:50.691448] Epoch: [1] [2060/3229] lr: 0.000027 grad_norm: 0.7255 (0.7579) closs: 1.0204 (1.0512) time: 4.0619 data: 0.0002 max mem: 54684 +[06:37:31.150888] Epoch: [1] [2070/3229] lr: 0.000026 grad_norm: 0.7327 (0.7578) closs: 1.0344 (1.0512) time: 4.0529 data: 0.0002 max mem: 54684 +[06:38:11.548200] Epoch: [1] [2080/3229] lr: 0.000026 grad_norm: 0.7327 (0.7576) closs: 1.0414 (1.0511) time: 4.0428 data: 0.0002 max mem: 54684 +[06:38:51.240169] Epoch: [1] [2090/3229] lr: 0.000026 grad_norm: 0.7045 (0.7574) closs: 0.9855 (1.0507) time: 4.0044 data: 0.0002 max mem: 54684 +[06:39:32.307282] Epoch: [1] [2100/3229] lr: 0.000026 grad_norm: 0.7700 (0.7575) closs: 0.9816 (1.0506) time: 4.0379 data: 0.0002 max mem: 54684 +[06:40:12.900721] Epoch: [1] [2110/3229] lr: 0.000026 grad_norm: 0.7683 (0.7572) closs: 1.0346 (1.0504) time: 4.0830 data: 0.0002 max mem: 54684 +[06:40:54.073838] Epoch: [1] [2120/3229] lr: 0.000026 grad_norm: 0.7175 (0.7572) closs: 1.0233 (1.0503) time: 4.0883 data: 0.0002 max mem: 54684 +[06:41:34.494204] Epoch: [1] [2130/3229] lr: 0.000026 grad_norm: 0.7567 (0.7575) closs: 1.0420 (1.0504) time: 4.0796 data: 0.0002 max mem: 54684 +[06:42:14.786527] Epoch: [1] [2140/3229] lr: 0.000026 grad_norm: 0.7746 (0.7574) closs: 1.0572 (1.0504) time: 4.0356 data: 0.0002 max mem: 54684 +[06:42:56.042219] Epoch: [1] [2150/3229] lr: 0.000026 grad_norm: 0.7436 (0.7574) closs: 1.0572 (1.0506) time: 4.0773 data: 0.0002 max mem: 54684 +[06:43:36.873461] Epoch: [1] [2160/3229] lr: 0.000026 grad_norm: 0.7565 (0.7574) closs: 1.1052 (1.0509) time: 4.1043 data: 0.0002 max mem: 54684 +[06:44:18.036372] Epoch: [1] [2170/3229] lr: 0.000026 grad_norm: 0.7559 (0.7573) closs: 1.0642 (1.0508) time: 4.0996 data: 0.0002 max mem: 54684 +[06:44:58.954633] Epoch: [1] [2180/3229] lr: 0.000026 grad_norm: 0.7512 
(0.7574) closs: 1.0519 (1.0507) time: 4.1040 data: 0.0002 max mem: 54684 +[06:45:39.550709] Epoch: [1] [2190/3229] lr: 0.000026 grad_norm: 0.7721 (0.7580) closs: 1.0434 (1.0508) time: 4.0757 data: 0.0002 max mem: 54684 +[06:46:20.109102] Epoch: [1] [2200/3229] lr: 0.000025 grad_norm: 0.7721 (0.7579) closs: 1.0396 (1.0507) time: 4.0577 data: 0.0002 max mem: 54684 +[06:47:01.387992] Epoch: [1] [2210/3229] lr: 0.000025 grad_norm: 0.7774 (0.7582) closs: 1.0393 (1.0508) time: 4.0918 data: 0.0002 max mem: 54684 +[06:47:42.292906] Epoch: [1] [2220/3229] lr: 0.000025 grad_norm: 0.7702 (0.7580) closs: 1.0655 (1.0509) time: 4.1091 data: 0.0002 max mem: 54684 +[06:48:22.615690] Epoch: [1] [2230/3229] lr: 0.000025 grad_norm: 0.7125 (0.7578) closs: 1.0602 (1.0509) time: 4.0613 data: 0.0002 max mem: 54684 +[06:49:03.122150] Epoch: [1] [2240/3229] lr: 0.000025 grad_norm: 0.7564 (0.7580) closs: 1.0392 (1.0507) time: 4.0414 data: 0.0002 max mem: 54684 +[06:49:43.913988] Epoch: [1] [2250/3229] lr: 0.000025 grad_norm: 0.7275 (0.7587) closs: 1.0392 (1.0508) time: 4.0649 data: 0.0002 max mem: 54684 +[06:50:24.836181] Epoch: [1] [2260/3229] lr: 0.000025 grad_norm: 0.7275 (0.7588) closs: 1.0988 (1.0510) time: 4.0856 data: 0.0002 max mem: 54684 +[06:51:05.820250] Epoch: [1] [2270/3229] lr: 0.000025 grad_norm: 0.7384 (0.7587) closs: 1.0513 (1.0508) time: 4.0953 data: 0.0002 max mem: 54684 +[06:51:46.269588] Epoch: [1] [2280/3229] lr: 0.000025 grad_norm: 0.7730 (0.7588) closs: 1.0402 (1.0509) time: 4.0716 data: 0.0002 max mem: 54684 +[06:52:27.410955] Epoch: [1] [2290/3229] lr: 0.000025 grad_norm: 0.7238 (0.7586) closs: 1.0385 (1.0507) time: 4.0795 data: 0.0002 max mem: 54684 +[06:53:08.358905] Epoch: [1] [2300/3229] lr: 0.000025 grad_norm: 0.7204 (0.7586) closs: 1.0344 (1.0506) time: 4.1044 data: 0.0002 max mem: 54684 +[06:53:49.351820] Epoch: [1] [2310/3229] lr: 0.000025 grad_norm: 0.7408 (0.7585) closs: 1.0498 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[06:54:30.150638] Epoch: [1] [2320/3229] lr: 0.000025 grad_norm: 0.7644 (0.7586) closs: 1.0665 (1.0507) time: 4.0895 data: 0.0002 max mem: 54684 +[06:55:11.609155] Epoch: [1] [2330/3229] lr: 0.000024 grad_norm: 0.7644 (0.7586) closs: 1.0783 (1.0508) time: 4.1128 data: 0.0002 max mem: 54684 +[06:55:52.526582] Epoch: [1] [2340/3229] lr: 0.000024 grad_norm: 0.7481 (0.7587) closs: 1.0944 (1.0510) time: 4.1187 data: 0.0002 max mem: 54684 +[06:56:33.528322] Epoch: [1] [2350/3229] lr: 0.000024 grad_norm: 0.7847 (0.7587) closs: 1.1101 (1.0511) time: 4.0959 data: 0.0002 max mem: 54684 +[06:57:13.657474] Epoch: [1] [2360/3229] lr: 0.000024 grad_norm: 0.7256 (0.7585) closs: 1.0694 (1.0511) time: 4.0565 data: 0.0002 max mem: 54684 +[06:57:54.348834] Epoch: [1] [2370/3229] lr: 0.000024 grad_norm: 0.7192 (0.7584) closs: 1.0375 (1.0509) time: 4.0410 data: 0.0002 max mem: 54684 +[06:58:35.300759] Epoch: [1] [2380/3229] lr: 0.000024 grad_norm: 0.7204 (0.7584) closs: 1.0375 (1.0509) time: 4.0821 data: 0.0002 max mem: 54684 +[06:59:15.644906] Epoch: [1] [2390/3229] lr: 0.000024 grad_norm: 0.7417 (0.7583) closs: 1.0504 (1.0508) time: 4.0647 data: 0.0002 max mem: 54684 +[06:59:56.423928] Epoch: [1] [2400/3229] lr: 0.000024 grad_norm: 0.7580 (0.7583) closs: 1.0390 (1.0508) time: 4.0561 data: 0.0002 max mem: 54684 +[07:00:36.732262] Epoch: [1] [2410/3229] lr: 0.000024 grad_norm: 0.7551 (0.7580) closs: 1.0390 (1.0508) time: 4.0543 data: 0.0002 max mem: 54684 +[07:01:17.509223] Epoch: [1] [2420/3229] lr: 0.000024 grad_norm: 0.7224 (0.7579) closs: 1.0253 (1.0507) time: 4.0542 data: 
0.0002 max mem: 54684 +[07:01:59.103175] Epoch: [1] [2430/3229] lr: 0.000024 grad_norm: 0.7534 (0.7580) closs: 1.0289 (1.0507) time: 4.1185 data: 0.0002 max mem: 54684 +[07:02:39.633371] Epoch: [1] [2440/3229] lr: 0.000024 grad_norm: 0.7523 (0.7579) closs: 1.0303 (1.0506) time: 4.1061 data: 0.0002 max mem: 54684 +[07:03:21.099615] Epoch: [1] [2450/3229] lr: 0.000024 grad_norm: 0.7399 (0.7580) closs: 1.0666 (1.0508) time: 4.0998 data: 0.0002 max mem: 54684 +[07:04:01.876483] Epoch: [1] [2460/3229] lr: 0.000023 grad_norm: 0.7601 (0.7597) closs: 1.0710 (1.0509) time: 4.1121 data: 0.0002 max mem: 54684 +[07:04:42.177832] Epoch: [1] [2470/3229] lr: 0.000023 grad_norm: 0.7070 (0.7594) closs: 1.0346 (1.0508) time: 4.0538 data: 0.0002 max mem: 54684 +[07:05:22.730871] Epoch: [1] [2480/3229] lr: 0.000023 grad_norm: 0.6798 (0.7592) closs: 1.0346 (1.0508) time: 4.0427 data: 0.0002 max mem: 54684 +[07:06:04.230691] Epoch: [1] [2490/3229] lr: 0.000023 grad_norm: 0.7520 (0.7593) closs: 1.0195 (1.0507) time: 4.1026 data: 0.0002 max mem: 54684 +[07:06:44.671239] Epoch: [1] [2500/3229] lr: 0.000023 grad_norm: 0.7559 (0.7592) closs: 1.0066 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[07:07:25.306096] Epoch: [1] [2510/3229] lr: 0.000023 grad_norm: 0.7394 (0.7590) closs: 1.0125 (1.0504) time: 4.0537 data: 0.0002 max mem: 54684 +[07:08:05.672440] Epoch: [1] [2520/3229] lr: 0.000023 grad_norm: 0.7341 (0.7589) closs: 1.0304 (1.0502) time: 4.0500 data: 0.0002 max mem: 54684 +[07:08:46.342159] Epoch: [1] [2530/3229] lr: 0.000023 grad_norm: 0.7582 (0.7590) closs: 1.0381 (1.0503) time: 4.0517 data: 0.0002 max mem: 54684 +[07:09:27.086959] Epoch: [1] [2540/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0503) time: 4.0707 data: 0.0002 max mem: 54684 +[07:10:08.439021] Epoch: [1] [2550/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0505) time: 4.1048 data: 0.0002 max mem: 54684 +[07:10:49.035524] Epoch: [1] [2560/3229] lr: 0.000023 grad_norm: 0.7595 (0.7590) closs: 1.0433 (1.0504) time: 4.0974 data: 0.0002 max mem: 54684 +[07:11:30.100693] Epoch: [1] [2570/3229] lr: 0.000023 grad_norm: 0.7435 (0.7588) closs: 1.0562 (1.0506) time: 4.0830 data: 0.0002 max mem: 54684 +[07:12:11.188418] Epoch: [1] [2580/3229] lr: 0.000023 grad_norm: 0.7404 (0.7589) closs: 1.1022 (1.0507) time: 4.1076 data: 0.0002 max mem: 54684 +[07:12:52.475637] Epoch: [1] [2590/3229] lr: 0.000022 grad_norm: 0.7772 (0.7591) closs: 1.0955 (1.0509) time: 4.1187 data: 0.0002 max mem: 54684 +[07:13:33.482943] Epoch: [1] [2600/3229] lr: 0.000022 grad_norm: 0.7876 (0.7590) closs: 1.0745 (1.0508) time: 4.1147 data: 0.0002 max mem: 54684 +[07:14:14.212298] Epoch: [1] [2610/3229] lr: 0.000022 grad_norm: 0.7625 (0.7589) closs: 1.0001 (1.0507) time: 4.0868 data: 0.0002 max mem: 54684 +[07:14:55.108401] Epoch: [1] [2620/3229] lr: 0.000022 grad_norm: 0.7431 (0.7588) closs: 1.0001 (1.0506) time: 4.0812 data: 0.0002 max mem: 54684 +[07:15:36.195488] Epoch: [1] [2630/3229] lr: 0.000022 grad_norm: 0.7435 (0.7590) closs: 1.0573 (1.0508) time: 4.0991 data: 0.0002 max mem: 54684 +[07:16:17.647209] Epoch: [1] [2640/3229] lr: 0.000022 grad_norm: 0.7787 (0.7590) closs: 1.0782 (1.0508) time: 4.1269 data: 0.0002 max mem: 54684 +[07:16:58.892659] Epoch: [1] [2650/3229] lr: 0.000022 grad_norm: 0.7769 (0.7590) closs: 1.0465 (1.0508) time: 4.1348 data: 0.0002 max mem: 54684 +[07:17:39.799115] Epoch: [1] [2660/3229] lr: 0.000022 grad_norm: 0.7668 (0.7590) closs: 1.0389 (1.0509) time: 4.1075 data: 0.0002 max mem: 54684 +[07:18:20.650599] Epoch: 
[1] [2670/3229] lr: 0.000022 grad_norm: 0.7837 (0.7592) closs: 1.0408 (1.0508) time: 4.0878 data: 0.0002 max mem: 54684 +[07:19:01.954445] Epoch: [1] [2680/3229] lr: 0.000022 grad_norm: 0.7896 (0.7591) closs: 1.0526 (1.0508) time: 4.1077 data: 0.0002 max mem: 54684 +[07:19:42.555933] Epoch: [1] [2690/3229] lr: 0.000022 grad_norm: 0.7348 (0.7591) closs: 1.0526 (1.0508) time: 4.0952 data: 0.0002 max mem: 54684 +[07:20:23.489822] Epoch: [1] [2700/3229] lr: 0.000022 grad_norm: 0.7487 (0.7592) closs: 1.0357 (1.0508) time: 4.0767 data: 0.0002 max mem: 54684 +[07:21:03.709034] Epoch: [1] [2710/3229] lr: 0.000022 grad_norm: 0.7536 (0.7593) closs: 1.0253 (1.0506) time: 4.0576 data: 0.0002 max mem: 54684 +[07:21:44.808335] Epoch: [1] [2720/3229] lr: 0.000021 grad_norm: 0.7750 (0.7593) closs: 1.0362 (1.0505) time: 4.0659 data: 0.0002 max mem: 54684 +[07:22:26.050486] Epoch: [1] [2730/3229] lr: 0.000021 grad_norm: 0.7431 (0.7593) closs: 1.0379 (1.0505) time: 4.1170 data: 0.0002 max mem: 54684 +[07:23:06.929188] Epoch: [1] [2740/3229] lr: 0.000021 grad_norm: 0.7314 (0.7592) closs: 1.0382 (1.0505) time: 4.1060 data: 0.0002 max mem: 54684 +[07:23:47.376913] Epoch: [1] [2750/3229] lr: 0.000021 grad_norm: 0.7727 (0.7592) closs: 1.0259 (1.0503) time: 4.0663 data: 0.0002 max mem: 54684 +[07:24:28.781962] Epoch: [1] [2760/3229] lr: 0.000021 grad_norm: 0.7543 (0.7591) closs: 1.0343 (1.0504) time: 4.0926 data: 0.0002 max mem: 54684 +[07:25:09.722107] Epoch: [1] [2770/3229] lr: 0.000021 grad_norm: 0.7543 (0.7592) closs: 1.0747 (1.0505) time: 4.1172 data: 0.0002 max mem: 54684 +[07:25:50.621826] Epoch: [1] [2780/3229] lr: 0.000021 grad_norm: 0.7577 (0.7591) closs: 1.0669 (1.0505) time: 4.0919 data: 0.0002 max mem: 54684 +[07:26:30.847780] Epoch: [1] [2790/3229] lr: 0.000021 grad_norm: 0.7133 (0.7590) closs: 1.0389 (1.0505) time: 4.0562 data: 0.0002 max mem: 54684 +[07:27:12.004919] Epoch: [1] [2800/3229] lr: 0.000021 grad_norm: 0.7479 (0.7588) closs: 1.0389 (1.0505) time: 4.0691 data: 0.0002 max mem: 54684 +[07:27:52.590244] Epoch: [1] [2810/3229] lr: 0.000021 grad_norm: 0.7538 (0.7590) closs: 1.0261 (1.0503) time: 4.0871 data: 0.0002 max mem: 54684 +[07:28:33.829143] Epoch: [1] [2820/3229] lr: 0.000021 grad_norm: 0.7608 (0.7591) closs: 1.0348 (1.0505) time: 4.0911 data: 0.0002 max mem: 54684 +[07:29:14.344969] Epoch: [1] [2830/3229] lr: 0.000021 grad_norm: 0.7575 (0.7589) closs: 1.0923 (1.0506) time: 4.0877 data: 0.0002 max mem: 54684 +[07:29:54.993752] Epoch: [1] [2840/3229] lr: 0.000021 grad_norm: 0.7351 (0.7589) closs: 1.0629 (1.0506) time: 4.0582 data: 0.0002 max mem: 54684 +[07:30:35.251355] Epoch: [1] [2850/3229] lr: 0.000020 grad_norm: 0.7529 (0.7587) closs: 1.0003 (1.0504) time: 4.0453 data: 0.0002 max mem: 54684 +[07:31:15.852435] Epoch: [1] [2860/3229] lr: 0.000020 grad_norm: 0.7370 (0.7586) closs: 1.0153 (1.0504) time: 4.0429 data: 0.0002 max mem: 54684 +[07:31:56.405734] Epoch: [1] [2870/3229] lr: 0.000020 grad_norm: 0.7265 (0.7583) closs: 1.0422 (1.0503) time: 4.0577 data: 0.0002 max mem: 54684 +[07:32:37.536609] Epoch: [1] [2880/3229] lr: 0.000020 grad_norm: 0.7202 (0.7582) closs: 1.0430 (1.0504) time: 4.0841 data: 0.0002 max mem: 54684 +[07:33:18.137999] Epoch: [1] [2890/3229] lr: 0.000020 grad_norm: 0.7069 (0.7580) closs: 1.0299 (1.0504) time: 4.0866 data: 0.0002 max mem: 54684 +[07:33:59.054025] Epoch: [1] [2900/3229] lr: 0.000020 grad_norm: 0.7126 (0.7579) closs: 0.9991 (1.0503) time: 4.0758 data: 0.0002 max mem: 54684 +[07:34:40.302514] Epoch: [1] [2910/3229] lr: 0.000020 grad_norm: 0.7618 
(0.7580) closs: 1.0246 (1.0504) time: 4.1082 data: 0.0002 max mem: 54684
+[07:35:21.613397] Epoch: [1] [2920/3229] lr: 0.000020 grad_norm: 0.7726 (0.7581) closs: 1.0246 (1.0504) time: 4.1279 data: 0.0002 max mem: 54684
+[07:36:01.885012] Epoch: [1] [2930/3229] lr: 0.000020 grad_norm: 0.7795 (0.7580) closs: 1.0148 (1.0502) time: 4.0791 data: 0.0002 max mem: 54684
+[07:36:42.517521] Epoch: [1] [2940/3229] lr: 0.000020 grad_norm: 0.7532 (0.7580) closs: 1.0222 (1.0501) time: 4.0451 data: 0.0002 max mem: 54684
+[07:37:22.983411] Epoch: [1] [2950/3229] lr: 0.000020 grad_norm: 0.7556 (0.7579) closs: 1.0219 (1.0500) time: 4.0549 data: 0.0002 max mem: 54684
+[07:38:03.810435] Epoch: [1] [2960/3229] lr: 0.000020 grad_norm: 0.7329 (0.7577) closs: 1.0213 (1.0499) time: 4.0646 data: 0.0002 max mem: 54684
+[07:38:44.734197] Epoch: [1] [2970/3229] lr: 0.000020 grad_norm: 0.7669 (0.7578) closs: 1.0230 (1.0499) time: 4.0875 data: 0.0002 max mem: 54684
+[07:39:25.067815] Epoch: [1] [2980/3229] lr: 0.000020 grad_norm: 0.7443 (0.7575) closs: 1.0170 (1.0497) time: 4.0628 data: 0.0002 max mem: 54684
+[07:40:06.277140] Epoch: [1] [2990/3229] lr: 0.000019 grad_norm: 0.7237 (0.7575) closs: 1.0334 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684
+[07:40:47.518619] Epoch: [1] [3000/3229] lr: 0.000019 grad_norm: 0.7443 (0.7576) closs: 1.0808 (1.0499) time: 4.1225 data: 0.0002 max mem: 54684
+[07:41:28.756394] Epoch: [1] [3010/3229] lr: 0.000019 grad_norm: 0.7306 (0.7576) closs: 1.0731 (1.0499) time: 4.1239 data: 0.0002 max mem: 54684
+[07:42:09.820428] Epoch: [1] [3020/3229] lr: 0.000019 grad_norm: 0.7376 (0.7576) closs: 1.0600 (1.0498) time: 4.1150 data: 0.0002 max mem: 54684
+[07:42:50.371264] Epoch: [1] [3030/3229] lr: 0.000019 grad_norm: 0.7764 (0.7576) closs: 1.0239 (1.0497) time: 4.0807 data: 0.0002 max mem: 54684
+[07:43:31.735769] Epoch: [1] [3040/3229] lr: 0.000019 grad_norm: 0.8111 (0.7577) closs: 1.0480 (1.0498) time: 4.0957 data: 0.0002 max mem: 54684
+[07:44:12.340935] Epoch: [1] [3050/3229] lr: 0.000019 grad_norm: 0.7614 (0.7576) closs: 1.0513 (1.0498) time: 4.0984 data: 0.0002 max mem: 54684
+[07:44:53.338278] Epoch: [1] [3060/3229] lr: 0.000019 grad_norm: 0.7472 (0.7577) closs: 1.0586 (1.0498) time: 4.0801 data: 0.0002 max mem: 54684
+[07:45:33.983619] Epoch: [1] [3070/3229] lr: 0.000019 grad_norm: 0.7472 (0.7576) closs: 1.0348 (1.0497) time: 4.0821 data: 0.0002 max mem: 54684
+[07:46:14.880787] Epoch: [1] [3080/3229] lr: 0.000019 grad_norm: 0.7548 (0.7576) closs: 1.0463 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684
+[07:46:55.135508] Epoch: [1] [3090/3229] lr: 0.000019 grad_norm: 0.7275 (0.7574) closs: 1.0621 (1.0497) time: 4.0575 data: 0.0002 max mem: 54684
+[07:47:35.489073] Epoch: [1] [3100/3229] lr: 0.000019 grad_norm: 0.7098 (0.7573) closs: 0.9939 (1.0496) time: 4.0303 data: 0.0002 max mem: 54684
+[07:48:16.842485] Epoch: [1] [3110/3229] lr: 0.000019 grad_norm: 0.7606 (0.7574) closs: 1.0400 (1.0497) time: 4.0853 data: 0.0002 max mem: 54684
+[07:48:58.060831] Epoch: [1] [3120/3229] lr: 0.000019 grad_norm: 0.7567 (0.7574) closs: 1.0607 (1.0498) time: 4.1285 data: 0.0002 max mem: 54684
+[07:49:39.119706] Epoch: [1] [3130/3229] lr: 0.000018 grad_norm: 0.7286 (0.7572) closs: 1.0710 (1.0499) time: 4.1138 data: 0.0002 max mem: 54684
+[07:50:19.961668] Epoch: [1] [3140/3229] lr: 0.000018 grad_norm: 0.7239 (0.7572) closs: 1.0326 (1.0498) time: 4.0950 data: 0.0002 max mem: 54684
+[07:51:00.627490] Epoch: [1] [3150/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0245 (1.0498) time: 4.0753 data: 0.0002 max mem: 54684
+[07:51:41.096988] Epoch: [1] [3160/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0417 (1.0497) time: 4.0567 data: 0.0002 max mem: 54684
+[07:52:21.540824] Epoch: [1] [3170/3229] lr: 0.000018 grad_norm: 0.7199 (0.7571) closs: 1.0092 (1.0496) time: 4.0456 data: 0.0002 max mem: 54684
+[07:53:01.782122] Epoch: [1] [3180/3229] lr: 0.000018 grad_norm: 0.7290 (0.7570) closs: 1.0158 (1.0495) time: 4.0342 data: 0.0002 max mem: 54684
+[07:53:42.520581] Epoch: [1] [3190/3229] lr: 0.000018 grad_norm: 0.7290 (0.7569) closs: 1.0676 (1.0495) time: 4.0489 data: 0.0002 max mem: 54684
+[07:54:22.768996] Epoch: [1] [3200/3229] lr: 0.000018 grad_norm: 0.7260 (0.7568) closs: 1.0775 (1.0495) time: 4.0493 data: 0.0002 max mem: 54684
+[07:55:03.308755] Epoch: [1] [3210/3229] lr: 0.000018 grad_norm: 0.7512 (0.7568) closs: 1.0424 (1.0494) time: 4.0393 data: 0.0002 max mem: 54684
+[07:55:44.129491] Epoch: [1] [3220/3229] lr: 0.000018 grad_norm: 0.7508 (0.7567) closs: 1.0736 (1.0494) time: 4.0680 data: 0.0001 max mem: 54684
+[07:56:16.699747] Epoch: [1] Total time: 3:40:05
+[07:56:16.736123] Averaged stats: lr: 0.000018 grad_norm: 0.7281 (0.7566) closs: 1.0383 (1.0483)
+[07:56:17.102213] model saved
+[07:56:18.915517] optimizer saved
+[07:56:18.916157] other rank-common saved
+[07:56:18.922035] rank-specific saved
+[07:56:18.936513] log_dir: ./output_dir
+[07:56:31.268719] Epoch: [2] [0/3229] lr: 0.000018 grad_norm: 0.8262 (0.8262) closs: 1.0084 (1.0084) time: 12.3313 data: 8.2426 max mem: 54684
+[07:57:12.183294] Epoch: [2] [10/3229] lr: 0.000018 grad_norm: 0.7358 (0.7439) closs: 1.0242 (1.0370) time: 4.8405 data: 0.7495 max mem: 54684
+[07:57:52.839749] Epoch: [2] [20/3229] lr: 0.000018 grad_norm: 0.7358 (0.7614) closs: 1.0264 (1.0467) time: 4.0785 data: 0.0002 max mem: 54684
+[07:58:33.435224] Epoch: [2] [30/3229] lr: 0.000018 grad_norm: 0.7811 (0.7651) closs: 1.0351 (1.0354) time: 4.0625 data: 0.0002 max mem: 54684
+[07:59:15.317025] Epoch: [2] [40/3229] lr: 0.000017 grad_norm: 0.7708 (0.7695) closs: 1.0460 (1.0453) time: 4.1238 data: 0.0002 max mem: 54684
+[07:59:56.906136] Epoch: [2] [50/3229] lr: 0.000017 grad_norm: 0.7742 (0.7745) closs: 1.0633 (1.0491) time: 4.1735 data: 0.0002 max mem: 54684
+[08:00:37.793068] Epoch: [2] [60/3229] lr: 0.000017 grad_norm: 0.7780 (0.7750) closs: 1.0396 (1.0518) time: 4.1237 data: 0.0002 max mem: 54684
+[08:01:17.989880] Epoch: [2] [70/3229] lr: 0.000017 grad_norm: 0.7406 (0.7669) closs: 1.0464 (1.0460) time: 4.0541 data: 0.0002 max mem: 54684
+[08:01:59.026748] Epoch: [2] [80/3229] lr: 0.000017 grad_norm: 0.7321 (0.7684) closs: 1.0689 (1.0462) time: 4.0616 data: 0.0002 max mem: 54684
+[08:02:39.476084] Epoch: [2] [90/3229] lr: 0.000017 grad_norm: 0.7710 (0.7672) closs: 1.0697 (1.0457) time: 4.0742 data: 0.0002 max mem: 54684
+[08:03:20.882001] Epoch: [2] [100/3229] lr: 0.000017 grad_norm: 0.7731 (0.7683) closs: 1.0697 (1.0507) time: 4.0927 data: 0.0002 max mem: 54684
+[08:04:01.732738] Epoch: [2] [110/3229] lr: 0.000017 grad_norm: 0.7636 (0.7693) closs: 1.0633 (1.0496) time: 4.1128 data: 0.0002 max mem: 54684
+[08:04:43.243616] Epoch: [2] [120/3229] lr: 0.000017 grad_norm: 0.7653 (0.7695) closs: 1.0444 (1.0471) time: 4.1180 data: 0.0002 max mem: 54684
+[08:05:24.640326] Epoch: [2] [130/3229] lr: 0.000017 grad_norm: 0.7895 (0.7719) closs: 1.0530 (1.0490) time: 4.1453 data: 0.0002 max mem: 54684
+[08:06:04.919803] Epoch: [2] [140/3229] lr: 0.000017 grad_norm: 0.7767 (0.7695) closs: 1.0625 (1.0491) time: 4.0837 data: 0.0002 max mem: 54684
+[08:06:45.485984] Epoch: [2] [150/3229] lr: 0.000017 grad_norm: 0.7487 (0.7669) closs: 1.0756 (1.0476) time: 4.0422 data: 0.0002 max mem: 54684
+[08:07:26.589317] Epoch: [2] [160/3229] lr: 0.000017 grad_norm: 0.7543 (0.7668) closs: 1.0570 (1.0491) time: 4.0834 data: 0.0002 max mem: 54684
+[08:08:07.809141] Epoch: [2] [170/3229] lr: 0.000017 grad_norm: 0.7705 (0.7656) closs: 1.0286 (1.0475) time: 4.1161 data: 0.0002 max mem: 54684
+[08:08:48.382644] Epoch: [2] [180/3229] lr: 0.000016 grad_norm: 0.7462 (0.7654) closs: 1.0195 (1.0463) time: 4.0896 data: 0.0002 max mem: 54684
+[08:09:29.267697] Epoch: [2] [190/3229] lr: 0.000016 grad_norm: 0.7566 (0.7639) closs: 1.0265 (1.0465) time: 4.0729 data: 0.0002 max mem: 54684
+[08:10:09.377776] Epoch: [2] [200/3229] lr: 0.000016 grad_norm: 0.7441 (0.7628) closs: 1.0265 (1.0435) time: 4.0497 data: 0.0002 max mem: 54684
+[08:10:49.302139] Epoch: [2] [210/3229] lr: 0.000016 grad_norm: 0.6802 (0.7596) closs: 0.9949 (1.0425) time: 4.0017 data: 0.0002 max mem: 54684
+[08:11:30.519323] Epoch: [2] [220/3229] lr: 0.000016 grad_norm: 0.7500 (0.7609) closs: 1.0888 (1.0456) time: 4.0570 data: 0.0002 max mem: 54684
+[08:12:11.704337] Epoch: [2] [230/3229] lr: 0.000016 grad_norm: 0.7773 (0.7609) closs: 1.0791 (1.0452) time: 4.1200 data: 0.0002 max mem: 54684
+[08:12:52.393373] Epoch: [2] [240/3229] lr: 0.000016 grad_norm: 0.7641 (0.7595) closs: 1.0536 (1.0445) time: 4.0936 data: 0.0002 max mem: 54684
+[08:13:33.064691] Epoch: [2] [250/3229] lr: 0.000016 grad_norm: 0.7542 (0.7585) closs: 1.0588 (1.0449) time: 4.0680 data: 0.0002 max mem: 54684
+[08:14:14.045473] Epoch: [2] [260/3229] lr: 0.000016 grad_norm: 0.7569 (0.7583) closs: 1.0562 (1.0460) time: 4.0825 data: 0.0002 max mem: 54684
+[08:14:54.905661] Epoch: [2] [270/3229] lr: 0.000016 grad_norm: 0.7574 (0.7579) closs: 1.0383 (1.0455) time: 4.0920 data: 0.0002 max mem: 54684
+[08:15:35.926717] Epoch: [2] [280/3229] lr: 0.000016 grad_norm: 0.7558 (0.7577) closs: 1.0300 (1.0455) time: 4.0940 data: 0.0002 max mem: 54684
+[08:16:16.814567] Epoch: [2] [290/3229] lr: 0.000016 grad_norm: 0.7416 (0.7564) closs: 1.0680 (1.0467) time: 4.0954 data: 0.0002 max mem: 54684
+[08:16:58.068460] Epoch: [2] [300/3229] lr: 0.000016 grad_norm: 0.7514 (0.7573) closs: 1.0594 (1.0468) time: 4.1070 data: 0.0002 max mem: 54684
+[08:17:38.927633] Epoch: [2] [310/3229] lr: 0.000016 grad_norm: 0.7797 (0.7577) closs: 1.0474 (1.0460) time: 4.1056 data: 0.0002 max mem: 54684
+[08:18:19.652536] Epoch: [2] [320/3229] lr: 0.000016 grad_norm: 0.7461 (0.7563) closs: 1.0455 (1.0458) time: 4.0791 data: 0.0002 max mem: 54684
+[08:19:00.882256] Epoch: [2] [330/3229] lr: 0.000015 grad_norm: 0.7702 (0.7581) closs: 1.0466 (1.0463) time: 4.0977 data: 0.0002 max mem: 54684
+[08:19:40.850985] Epoch: [2] [340/3229] lr: 0.000015 grad_norm: 0.7544 (0.7554) closs: 1.0466 (1.0460) time: 4.0599 data: 0.0002 max mem: 54684
+[08:20:21.704908] Epoch: [2] [350/3229] lr: 0.000015 grad_norm: 0.7372 (0.7563) closs: 1.0080 (1.0453) time: 4.0411 data: 0.0002 max mem: 54684
+[08:21:02.268285] Epoch: [2] [360/3229] lr: 0.000015 grad_norm: 0.8174 (0.7568) closs: 0.9763 (1.0430) time: 4.0708 data: 0.0002 max mem: 54684
+[08:21:42.525189] Epoch: [2] [370/3229] lr: 0.000015 grad_norm: 0.7369 (0.7568) closs: 0.9641 (1.0418) time: 4.0410 data: 0.0002 max mem: 54684
+[08:22:23.507510] Epoch: [2] [380/3229] lr: 0.000015 grad_norm: 0.7944 (0.7578) closs: 1.0548 (1.0421) time: 4.0619 data: 0.0002 max mem: 54684
+[08:23:04.788986] Epoch: [2] [390/3229] lr: 0.000015 grad_norm: 0.7995 (0.7582) closs: 1.0502 (1.0420) time: 4.1131 data: 0.0002 max mem: 54684
+[08:23:45.036107] Epoch: [2] [400/3229] lr: 0.000015 grad_norm: 0.7850 (0.7579) closs: 1.0068 (1.0413) time: 4.0764 data: 0.0002 max mem: 54684
+[08:24:25.660061] Epoch: [2] [410/3229] lr: 0.000015 grad_norm: 0.7850 (0.7576) closs: 1.0540 (1.0410) time: 4.0435 data: 0.0002 max mem: 54684
+[08:25:06.015694] Epoch: [2] [420/3229] lr: 0.000015 grad_norm: 0.7293 (0.7570) closs: 1.0468 (1.0396) time: 4.0489 data: 0.0002 max mem: 54684
+[08:25:46.999427] Epoch: [2] [430/3229] lr: 0.000015 grad_norm: 0.7238 (0.7570) closs: 1.0225 (1.0398) time: 4.0669 data: 0.0002 max mem: 54684
+[08:26:27.949780] Epoch: [2] [440/3229] lr: 0.000015 grad_norm: 0.7509 (0.7575) closs: 1.0472 (1.0399) time: 4.0966 data: 0.0002 max mem: 54684
+[08:27:08.849435] Epoch: [2] [450/3229] lr: 0.000015 grad_norm: 0.7973 (0.7579) closs: 1.0547 (1.0396) time: 4.0924 data: 0.0002 max mem: 54684
+[08:27:49.906838] Epoch: [2] [460/3229] lr: 0.000015 grad_norm: 0.7565 (0.7575) closs: 0.9997 (1.0391) time: 4.0978 data: 0.0002 max mem: 54684
+[08:28:30.907563] Epoch: [2] [470/3229] lr: 0.000015 grad_norm: 0.7846 (0.7587) closs: 1.0305 (1.0388) time: 4.1028 data: 0.0002 max mem: 54684
+[08:29:11.501494] Epoch: [2] [480/3229] lr: 0.000015 grad_norm: 0.7925 (0.7583) closs: 1.0507 (1.0382) time: 4.0797 data: 0.0002 max mem: 54684
+[08:29:52.450393] Epoch: [2] [490/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0217 (1.0378) time: 4.0771 data: 0.0002 max mem: 54684
+[08:30:33.131092] Epoch: [2] [500/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0195 (1.0377) time: 4.0814 data: 0.0002 max mem: 54684
+[08:31:13.731758] Epoch: [2] [510/3229] lr: 0.000014 grad_norm: 0.7490 (0.7574) closs: 1.0083 (1.0372) time: 4.0640 data: 0.0002 max mem: 54684
+[08:31:54.606631] Epoch: [2] [520/3229] lr: 0.000014 grad_norm: 0.7661 (0.7578) closs: 1.0242 (1.0374) time: 4.0737 data: 0.0002 max mem: 54684
+[08:32:35.652465] Epoch: [2] [530/3229] lr: 0.000014 grad_norm: 0.7661 (0.7574) closs: 1.0650 (1.0373) time: 4.0960 data: 0.0002 max mem: 54684
+[08:33:16.811929] Epoch: [2] [540/3229] lr: 0.000014 grad_norm: 0.7808 (0.7580) closs: 1.0536 (1.0379) time: 4.1102 data: 0.0002 max mem: 54684
+[08:33:58.098430] Epoch: [2] [550/3229] lr: 0.000014 grad_norm: 0.7890 (0.7584) closs: 1.0505 (1.0379) time: 4.1222 data: 0.0002 max mem: 54684
+[08:34:39.012102] Epoch: [2] [560/3229] lr: 0.000014 grad_norm: 0.7536 (0.7579) closs: 1.0541 (1.0380) time: 4.1099 data: 0.0002 max mem: 54684
+[08:35:20.411837] Epoch: [2] [570/3229] lr: 0.000014 grad_norm: 0.7575 (0.7583) closs: 1.0281 (1.0376) time: 4.1156 data: 0.0002 max mem: 54684
+[08:36:01.585232] Epoch: [2] [580/3229] lr: 0.000014 grad_norm: 0.7735 (0.7588) closs: 1.0281 (1.0379) time: 4.1286 data: 0.0002 max mem: 54684
+[08:36:42.866022] Epoch: [2] [590/3229] lr: 0.000014 grad_norm: 0.7978 (0.7597) closs: 1.0419 (1.0385) time: 4.1226 data: 0.0002 max mem: 54684
+[08:37:24.134897] Epoch: [2] [600/3229] lr: 0.000014 grad_norm: 0.7944 (0.7605) closs: 1.0606 (1.0390) time: 4.1274 data: 0.0002 max mem: 54684
+[08:38:05.564805] Epoch: [2] [610/3229] lr: 0.000014 grad_norm: 0.7800 (0.7610) closs: 1.0638 (1.0396) time: 4.1349 data: 0.0002 max mem: 54684
+[08:38:47.052757] Epoch: [2] [620/3229] lr: 0.000014 grad_norm: 0.7830 (0.7618) closs: 1.0701 (1.0400) time: 4.1458 data: 0.0002 max mem: 54684
+[08:39:28.070271] Epoch: [2] [630/3229] lr: 0.000014 grad_norm: 0.7867 (0.7620) closs: 1.0398 (1.0394) time: 4.1252 data: 0.0002 max mem: 54684
+[08:40:09.431657] Epoch: [2] [640/3229] lr: 0.000014 grad_norm: 0.7636 (0.7620) closs: 1.0375 (1.0393) time: 4.1189 data: 0.0002 max mem: 54684
+[08:40:50.029316] Epoch: [2] [650/3229] lr: 0.000013 grad_norm: 0.7721 (0.7620) closs: 1.0376 (1.0392) time: 4.0979 data: 0.0002 max mem: 54684
+[08:41:31.253698] Epoch: [2] [660/3229] lr: 0.000013 grad_norm: 0.8149 (0.7639) closs: 1.0376 (1.0390) time: 4.0910 data: 0.0002 max mem: 54684
+[08:42:11.674957] Epoch: [2] [670/3229] lr: 0.000013 grad_norm: 0.7910 (0.7640) closs: 0.9759 (1.0377) time: 4.0822 data: 0.0002 max mem: 54684
+[08:42:52.587904] Epoch: [2] [680/3229] lr: 0.000013 grad_norm: 0.8076 (0.7644) closs: 1.0142 (1.0376) time: 4.0666 data: 0.0002 max mem: 54684
+[08:43:33.204090] Epoch: [2] [690/3229] lr: 0.000013 grad_norm: 0.7571 (0.7643) closs: 1.0510 (1.0373) time: 4.0764 data: 0.0002 max mem: 54684
+[08:44:13.752622] Epoch: [2] [700/3229] lr: 0.000013 grad_norm: 0.7474 (0.7643) closs: 1.0141 (1.0369) time: 4.0582 data: 0.0002 max mem: 54684
+[08:44:54.180675] Epoch: [2] [710/3229] lr: 0.000013 grad_norm: 0.7605 (0.7641) closs: 1.0175 (1.0365) time: 4.0488 data: 0.0002 max mem: 54684
+[08:45:34.801694] Epoch: [2] [720/3229] lr: 0.000013 grad_norm: 0.7678 (0.7645) closs: 1.0230 (1.0360) time: 4.0524 data: 0.0002 max mem: 54684
+[08:46:15.397922] Epoch: [2] [730/3229] lr: 0.000013 grad_norm: 0.7292 (0.7638) closs: 1.0149 (1.0362) time: 4.0608 data: 0.0002 max mem: 54684
+[08:46:56.241797] Epoch: [2] [740/3229] lr: 0.000013 grad_norm: 0.7162 (0.7634) closs: 1.0468 (1.0362) time: 4.0719 data: 0.0002 max mem: 54684
+[08:47:37.401308] Epoch: [2] [750/3229] lr: 0.000013 grad_norm: 0.7125 (0.7625) closs: 1.0468 (1.0361) time: 4.1001 data: 0.0002 max mem: 54684
+[08:48:17.975003] Epoch: [2] [760/3229] lr: 0.000013 grad_norm: 0.7230 (0.7623) closs: 0.9952 (1.0357) time: 4.0866 data: 0.0002 max mem: 54684
+[08:48:58.599971] Epoch: [2] [770/3229] lr: 0.000013 grad_norm: 0.7391 (0.7618) closs: 0.9999 (1.0354) time: 4.0599 data: 0.0002 max mem: 54684
+[08:49:39.723248] Epoch: [2] [780/3229] lr: 0.000013 grad_norm: 0.7642 (0.7620) closs: 1.0226 (1.0358) time: 4.0873 data: 0.0002 max mem: 54684
+[08:50:20.513740] Epoch: [2] [790/3229] lr: 0.000013 grad_norm: 0.7684 (0.7622) closs: 1.0461 (1.0352) time: 4.0956 data: 0.0002 max mem: 54684
+[08:51:00.788007] Epoch: [2] [800/3229] lr: 0.000013 grad_norm: 0.7667 (0.7621) closs: 1.0416 (1.0349) time: 4.0532 data: 0.0002 max mem: 54684
+[08:51:42.084938] Epoch: [2] [810/3229] lr: 0.000012 grad_norm: 0.7514 (0.7623) closs: 1.0492 (1.0351) time: 4.0785 data: 0.0002 max mem: 54684
+[08:52:22.683852] Epoch: [2] [820/3229] lr: 0.000012 grad_norm: 0.7899 (0.7626) closs: 1.0387 (1.0346) time: 4.0947 data: 0.0002 max mem: 54684
+[08:53:03.136708] Epoch: [2] [830/3229] lr: 0.000012 grad_norm: 0.7319 (0.7623) closs: 1.0123 (1.0341) time: 4.0525 data: 0.0002 max mem: 54684
+[08:53:44.061472] Epoch: [2] [840/3229] lr: 0.000012 grad_norm: 0.7243 (0.7622) closs: 1.0273 (1.0344) time: 4.0688 data: 0.0002 max mem: 54684
+[08:54:24.982996] Epoch: [2] [850/3229] lr: 0.000012 grad_norm: 0.7494 (0.7625) closs: 1.0501 (1.0346) time: 4.0923 data: 0.0002 max mem: 54684
+[08:55:05.938568] Epoch: [2] [860/3229] lr: 0.000012 grad_norm: 0.7728 (0.7625) closs: 1.0500 (1.0344) time: 4.0938 data: 0.0002 max mem: 54684
+[08:55:47.280626] Epoch: [2] [870/3229] lr: 0.000012 grad_norm: 0.7828 (0.7630) closs: 1.0542 (1.0347) time: 4.1148 data: 0.0002 max mem: 54684
+[08:56:27.932374] Epoch: [2] [880/3229] lr: 0.000012 grad_norm: 0.7772 (0.7629) closs: 1.0480 (1.0344) time: 4.0996 data: 0.0002 max mem: 54684
+[08:57:09.157992] Epoch: [2] [890/3229] lr: 0.000012 grad_norm: 0.7710 (0.7629) closs: 1.0352 (1.0346) time: 4.0938 data: 0.0002 max mem: 54684
+[08:57:50.048167] Epoch: [2] [900/3229] lr: 0.000012 grad_norm: 0.7654 (0.7630) closs: 1.0552 (1.0348) time: 4.1057 data: 0.0002 max mem: 54684
+[08:58:31.441896] Epoch: [2] [910/3229] lr: 0.000012 grad_norm: 0.7748 (0.7631) closs: 1.0552 (1.0352) time: 4.1141 data: 0.0002 max mem: 54684
+[08:59:12.358818] Epoch: [2] [920/3229] lr: 0.000012 grad_norm: 0.7834 (0.7633) closs: 1.0587 (1.0353) time: 4.1155 data: 0.0002 max mem: 54684
+[08:59:53.384497] Epoch: [2] [930/3229] lr: 0.000012 grad_norm: 0.7600 (0.7639) closs: 1.0610 (1.0358) time: 4.0971 data: 0.0002 max mem: 54684
+[09:00:34.296386] Epoch: [2] [940/3229] lr: 0.000012 grad_norm: 0.7367 (0.7634) closs: 1.0361 (1.0355) time: 4.0968 data: 0.0002 max mem: 54684
+[09:01:15.300491] Epoch: [2] [950/3229] lr: 0.000012 grad_norm: 0.7532 (0.7637) closs: 1.0273 (1.0357) time: 4.0957 data: 0.0002 max mem: 54684
+[09:01:56.214466] Epoch: [2] [960/3229] lr: 0.000012 grad_norm: 0.8101 (0.7637) closs: 1.0385 (1.0358) time: 4.0958 data: 0.0002 max mem: 54684
+[09:02:37.461375] Epoch: [2] [970/3229] lr: 0.000012 grad_norm: 0.7784 (0.7640) closs: 1.0385 (1.0359) time: 4.1080 data: 0.0002 max mem: 54684
+[09:03:18.133192] Epoch: [2] [980/3229] lr: 0.000012 grad_norm: 0.7784 (0.7641) closs: 1.0226 (1.0356) time: 4.0959 data: 0.0002 max mem: 54684
+[09:03:58.519062] Epoch: [2] [990/3229] lr: 0.000011 grad_norm: 0.7619 (0.7637) closs: 1.0127 (1.0353) time: 4.0528 data: 0.0002 max mem: 54684
+[09:04:39.761660] Epoch: [2] [1000/3229] lr: 0.000011 grad_norm: 0.7440 (0.7638) closs: 1.0497 (1.0358) time: 4.0814 data: 0.0002 max mem: 54684
+[09:05:20.726355] Epoch: [2] [1010/3229] lr: 0.000011 grad_norm: 0.7440 (0.7637) closs: 1.0649 (1.0361) time: 4.1103 data: 0.0002 max mem: 54684
+[09:06:00.952110] Epoch: [2] [1020/3229] lr: 0.000011 grad_norm: 0.7606 (0.7640) closs: 1.0374 (1.0356) time: 4.0595 data: 0.0002 max mem: 54684
+[09:06:41.558460] Epoch: [2] [1030/3229] lr: 0.000011 grad_norm: 0.7384 (0.7636) closs: 1.0224 (1.0354) time: 4.0415 data: 0.0002 max mem: 54684
+[09:07:22.293637] Epoch: [2] [1040/3229] lr: 0.000011 grad_norm: 0.7438 (0.7637) closs: 1.0008 (1.0349) time: 4.0670 data: 0.0002 max mem: 54684
+[09:08:03.484133] Epoch: [2] [1050/3229] lr: 0.000011 grad_norm: 0.7482 (0.7634) closs: 1.0428 (1.0352) time: 4.0962 data: 0.0002 max mem: 54684
+[09:08:44.382701] Epoch: [2] [1060/3229] lr: 0.000011 grad_norm: 0.7485 (0.7636) closs: 1.0560 (1.0352) time: 4.1044 data: 0.0002 max mem: 54684
+[09:09:25.808204] Epoch: [2] [1070/3229] lr: 0.000011 grad_norm: 0.7534 (0.7633) closs: 1.0265 (1.0352) time: 4.1161 data: 0.0002 max mem: 54684
+[09:10:06.913588] Epoch: [2] [1080/3229] lr: 0.000011 grad_norm: 0.7569 (0.7634) closs: 1.0402 (1.0353) time: 4.1265 data: 0.0002 max mem: 54684
+[09:10:47.784050] Epoch: [2] [1090/3229] lr: 0.000011 grad_norm: 0.7794 (0.7635) closs: 1.0821 (1.0357) time: 4.0987 data: 0.0002 max mem: 54684
+[09:11:29.039121] Epoch: [2] [1100/3229] lr: 0.000011 grad_norm: 0.7814 (0.7636) closs: 1.0679 (1.0358) time: 4.1062 data: 0.0002 max mem: 54684
+[09:12:10.373961] Epoch: [2] [1110/3229] lr: 0.000011 grad_norm: 0.7643 (0.7639) closs: 1.0508 (1.0360) time: 4.1294 data: 0.0002 max mem: 54684
+[09:12:51.449473] Epoch: [2] [1120/3229] lr: 0.000011 grad_norm: 0.7562 (0.7638) closs: 1.0466 (1.0358) time: 4.1205 data: 0.0002 max mem: 54684
+[09:13:32.329462] Epoch: [2] [1130/3229] lr: 0.000011 grad_norm: 0.7943 (0.7646) closs: 1.0470 (1.0359) time: 4.0977 data: 0.0002 max mem: 54684
+[09:14:12.321437] Epoch: [2] [1140/3229] lr: 0.000011 grad_norm: 0.8189 (0.7647) closs: 1.0094 (1.0354) time: 4.0435 data: 0.0002 max mem: 54684
+[09:14:53.353944] Epoch: [2] [1150/3229] lr: 0.000011 grad_norm: 0.8029 (0.7651) closs: 0.9823 (1.0353) time: 4.0512 data: 0.0002 max mem: 54684
+[09:15:34.257366] Epoch: [2] [1160/3229] lr: 0.000011 grad_norm: 0.8163 (0.7653) closs: 1.0455 (1.0354) time: 4.0967 data: 0.0002 max mem: 54684
+[09:16:14.798544] Epoch: [2] [1170/3229] lr: 0.000011 grad_norm: 0.8128 (0.7652) closs: 1.0318 (1.0353) time: 4.0722 data: 0.0002 max mem: 54684
+[09:16:55.872961] Epoch: [2] [1180/3229] lr: 0.000010 grad_norm: 0.7566 (0.7651) closs: 1.0318 (1.0353) time: 4.0807 data: 0.0002 max mem: 54684
+[09:17:36.505833] Epoch: [2] [1190/3229] lr: 0.000010 grad_norm: 0.7433 (0.7650) closs: 1.0522 (1.0355) time: 4.0853 data: 0.0002 max mem: 54684
+[09:18:17.759241] Epoch: [2] [1200/3229] lr: 0.000010 grad_norm: 0.7621 (0.7651) closs: 1.0598 (1.0356) time: 4.0942 data: 0.0002 max mem: 54684
+[09:18:59.258398] Epoch: [2] [1210/3229] lr: 0.000010 grad_norm: 0.7694 (0.7652) closs: 1.0676 (1.0360) time: 4.1376 data: 0.0002 max mem: 54684
+[09:19:40.458187] Epoch: [2] [1220/3229] lr: 0.000010 grad_norm: 0.7466 (0.7650) closs: 1.0545 (1.0359) time: 4.1349 data: 0.0002 max mem: 54684
+[09:20:21.687691] Epoch: [2] [1230/3229] lr: 0.000010 grad_norm: 0.7782 (0.7653) closs: 1.0469 (1.0362) time: 4.1214 data: 0.0002 max mem: 54684
+[09:21:02.306360] Epoch: [2] [1240/3229] lr: 0.000010 grad_norm: 0.7785 (0.7651) closs: 1.0469 (1.0361) time: 4.0923 data: 0.0002 max mem: 54684
+[09:21:43.494418] Epoch: [2] [1250/3229] lr: 0.000010 grad_norm: 0.7669 (0.7654) closs: 1.0804 (1.0366) time: 4.0903 data: 0.0002 max mem: 54684
+[09:22:24.223450] Epoch: [2] [1260/3229] lr: 0.000010 grad_norm: 0.7949 (0.7653) closs: 1.0789 (1.0365) time: 4.0958 data: 0.0002 max mem: 54684
+[09:23:05.137178] Epoch: [2] [1270/3229] lr: 0.000010 grad_norm: 0.7839 (0.7655) closs: 1.0398 (1.0366) time: 4.0821 data: 0.0002 max mem: 54684
+[09:23:45.713305] Epoch: [2] [1280/3229] lr: 0.000010 grad_norm: 0.7820 (0.7654) closs: 1.0345 (1.0364) time: 4.0744 data: 0.0002 max mem: 54684
+[09:24:26.883899] Epoch: [2] [1290/3229] lr: 0.000010 grad_norm: 0.7848 (0.7655) closs: 1.0622 (1.0368) time: 4.0873 data: 0.0002 max mem: 54684
+[09:25:07.882033] Epoch: [2] [1300/3229] lr: 0.000010 grad_norm: 0.7626 (0.7653) closs: 1.0601 (1.0370) time: 4.1084 data: 0.0002 max mem: 54684
+[09:25:48.788547] Epoch: [2] [1310/3229] lr: 0.000010 grad_norm: 0.7626 (0.7655) closs: 1.0317 (1.0368) time: 4.0952 data: 0.0002 max mem: 54684
+[09:26:29.719803] Epoch: [2] [1320/3229] lr: 0.000010 grad_norm: 0.7593 (0.7652) closs: 1.0151 (1.0369) time: 4.0918 data: 0.0002 max mem: 54684
+[09:27:10.299624] Epoch: [2] [1330/3229] lr: 0.000010 grad_norm: 0.7602 (0.7652) closs: 1.0215 (1.0365) time: 4.0755 data: 0.0002 max mem: 54684
+[09:27:50.751310] Epoch: [2] [1340/3229] lr: 0.000010 grad_norm: 0.7643 (0.7651) closs: 0.9875 (1.0362) time: 4.0515 data: 0.0002 max mem: 54684
+[09:28:31.398595] Epoch: [2] [1350/3229] lr: 0.000010 grad_norm: 0.7591 (0.7650) closs: 0.9875 (1.0358) time: 4.0549 data: 0.0002 max mem: 54684
+[09:29:12.305782] Epoch: [2] [1360/3229] lr: 0.000010 grad_norm: 0.7741 (0.7651) closs: 1.0085 (1.0361) time: 4.0777 data: 0.0001 max mem: 54684
+[09:29:53.571306] Epoch: [2] [1370/3229] lr: 0.000010 grad_norm: 0.7792 (0.7653) closs: 1.0583 (1.0362) time: 4.1086 data: 0.0002 max mem: 54684
+[09:30:34.513851] Epoch: [2] [1380/3229] lr: 0.000009 grad_norm: 0.8100 (0.7656) closs: 1.0583 (1.0366) time: 4.1103 data: 0.0001 max mem: 54684
+[09:31:15.446793] Epoch: [2] [1390/3229] lr: 0.000009 grad_norm: 0.8007 (0.7656) closs: 1.0600 (1.0367) time: 4.0937 data: 0.0001 max mem: 54684
+[09:31:56.718293] Epoch: [2] [1400/3229] lr: 0.000009 grad_norm: 0.7785 (0.7658) closs: 1.0538 (1.0368) time: 4.1101 data: 0.0001 max mem: 54684
+[09:32:37.066477] Epoch: [2] [1410/3229] lr: 0.000009 grad_norm: 0.7777 (0.7654) closs: 1.0459 (1.0365) time: 4.0809 data: 0.0002 max mem: 54684
+[09:33:18.072485] Epoch: [2] [1420/3229] lr: 0.000009 grad_norm: 0.7777 (0.7656) closs: 1.0269 (1.0366) time: 4.0676 data: 0.0001 max mem: 54684
+[09:33:58.688418] Epoch: [2] [1430/3229] lr: 0.000009 grad_norm: 0.7950 (0.7658) closs: 1.0548 (1.0367) time: 4.0810 data: 0.0002 max mem: 54684
+[09:34:40.028938] Epoch: [2] [1440/3229] lr: 0.000009 grad_norm: 0.7547 (0.7655) closs: 1.0359 (1.0365) time: 4.0977 data: 0.0002 max mem: 54684
+[09:35:20.652994] Epoch: [2] [1450/3229] lr: 0.000009 grad_norm: 0.7292 (0.7654) closs: 1.0334 (1.0363) time: 4.0982 data: 0.0001 max mem: 54684
+[09:36:00.716699] Epoch: [2] [1460/3229] lr: 0.000009 grad_norm: 0.7434 (0.7651) closs: 0.9868 (1.0360) time: 4.0343 data: 0.0002 max mem: 54684
+[09:36:41.682870] Epoch: [2] [1470/3229] lr: 0.000009 grad_norm: 0.7452 (0.7652) closs: 0.9904 (1.0359) time: 4.0514 data: 0.0001 max mem: 54684
+[09:37:22.665178] Epoch: [2] [1480/3229] lr: 0.000009 grad_norm: 0.7568 (0.7653) closs: 1.0227 (1.0358) time: 4.0974 data: 0.0001 max mem: 54684
+[09:38:03.979174] Epoch: [2] [1490/3229] lr: 0.000009 grad_norm: 0.7775 (0.7653) closs: 1.0474 (1.0360) time: 4.1147 data: 0.0002 max mem: 54684
+[09:38:45.030747] Epoch: [2] [1500/3229] lr: 0.000009 grad_norm: 0.7560 (0.7653) closs: 1.0474 (1.0360) time: 4.1182 data: 0.0002 max mem: 54684
+[09:39:25.831863] Epoch: [2] [1510/3229] lr: 0.000009 grad_norm: 0.7644 (0.7652) closs: 1.0295 (1.0358) time: 4.0926 data: 0.0001 max mem: 54684
+[09:40:06.337863] Epoch: [2] [1520/3229] lr: 0.000009 grad_norm: 0.7842 (0.7653) closs: 1.0512 (1.0360) time: 4.0653 data: 0.0001 max mem: 54684
+[09:40:46.718752] Epoch: [2] [1530/3229] lr: 0.000009 grad_norm: 0.7452 (0.7650) closs: 1.0411 (1.0359) time: 4.0443 data: 0.0002 max mem: 54684
+[09:41:27.479284] Epoch: [2] [1540/3229] lr: 0.000009 grad_norm: 0.7451 (0.7647) closs: 1.0411 (1.0359) time: 4.0570 data: 0.0002 max mem: 54684
+[09:42:08.444014] Epoch: [2] [1550/3229] lr: 0.000009 grad_norm: 0.7502 (0.7648) closs: 1.0511 (1.0361) time: 4.0862 data: 0.0002 max mem: 54684
+[09:42:49.492626] Epoch: [2] [1560/3229] lr: 0.000009 grad_norm: 0.7626 (0.7648) closs: 1.0354 (1.0359) time: 4.1006 data: 0.0002 max mem: 54684
+[09:43:30.218498] Epoch: [2] [1570/3229] lr: 0.000009 grad_norm: 0.7822 (0.7649) closs: 1.0210 (1.0362) time: 4.0886 data: 0.0002 max mem: 54684
+[09:44:11.014656] Epoch: [2] [1580/3229] lr: 0.000009 grad_norm: 0.7599 (0.7648) closs: 1.0582 (1.0360) time: 4.0760 data: 0.0001 max mem: 54684
+[09:44:52.430295] Epoch: [2] [1590/3229] lr: 0.000009 grad_norm: 0.7556 (0.7649) closs: 1.0126 (1.0362) time: 4.1105 data: 0.0001 max mem: 54684
+[09:45:32.708025] Epoch: [2] [1600/3229] lr: 0.000009 grad_norm: 0.7689 (0.7650) closs: 1.0061 (1.0360) time: 4.0846 data: 0.0002 max mem: 54684
+[09:46:13.311218] Epoch: [2] [1610/3229] lr: 0.000008 grad_norm: 0.7788 (0.7652) closs: 1.0035 (1.0361) time: 4.0440 data: 0.0002 max mem: 54684
+[09:46:55.090230] Epoch: [2] [1620/3229] lr: 0.000008 grad_norm: 0.7534 (0.7651) closs: 1.0395 (1.0361) time: 4.1190 data: 0.0002 max mem: 54684
+[09:47:36.090209] Epoch: [2] [1630/3229] lr: 0.000008 grad_norm: 0.7519 (0.7651) closs: 1.0562 (1.0361) time: 4.1389 data: 0.0002 max mem: 54684
+[09:48:17.317549] Epoch: [2] [1640/3229] lr: 0.000008 grad_norm: 0.7637 (0.7651) closs: 1.0442 (1.0362) time: 4.1113 data: 0.0002 max mem: 54684
+[09:48:58.147416] Epoch: [2] [1650/3229] lr: 0.000008 grad_norm: 0.7460 (0.7650) closs: 1.0442 (1.0361) time: 4.1028 data: 0.0002 max mem: 54684
+[09:49:38.823915] Epoch: [2] [1660/3229] lr: 0.000008 grad_norm: 0.7707 (0.7653) closs: 0.9922 (1.0360) time: 4.0752 data: 0.0002 max mem: 54684
+[09:50:20.328398] Epoch: [2] [1670/3229] lr: 0.000008 grad_norm: 0.7983 (0.7656) closs: 1.0671 (1.0362) time: 4.1090 data: 0.0002 max mem: 54684
+[09:51:01.223284] Epoch: [2] [1680/3229] lr: 0.000008 grad_norm: 0.8044 (0.7656) closs: 1.0319 (1.0360) time: 4.1199 data: 0.0001 max mem: 54684
+[09:51:41.770661] Epoch: [2] [1690/3229] lr: 0.000008 grad_norm: 0.7965 (0.7656) closs: 1.0218 (1.0362) time: 4.0720 data: 0.0002 max mem: 54684
+[09:52:23.631344] Epoch: [2] [1700/3229] lr: 0.000008 grad_norm: 0.7809 (0.7658) closs: 1.0307 (1.0362) time: 4.1203 data: 0.0002 max mem: 54684
+[09:53:03.971107] Epoch: [2] [1710/3229] lr: 0.000008 grad_norm: 0.7531 (0.7655) closs: 1.0542 (1.0362) time: 4.1099 data: 0.0001 max mem: 54684
+[09:53:44.185439] Epoch: [2] [1720/3229] lr: 0.000008 grad_norm: 0.7503 (0.7655) closs: 1.0655 (1.0362) time: 4.0276 data: 0.0002 max mem: 54684
+[09:54:25.528421] Epoch: [2] [1730/3229] lr: 0.000008 grad_norm: 0.7650 (0.7655) closs: 1.0793 (1.0364) time: 4.0778 data: 0.0002 max mem: 54684
+[09:55:06.203734] Epoch: [2] [1740/3229] lr: 0.000008 grad_norm: 0.7650 (0.7657) closs: 1.0568 (1.0362) time: 4.1008 data: 0.0002 max mem: 54684
+[09:55:46.816613] Epoch: [2] [1750/3229] lr: 0.000008 grad_norm: 0.7542 (0.7655) closs: 1.0362 (1.0364) time: 4.0644 data: 0.0002 max mem: 54684
+[09:56:27.712286] Epoch: [2] [1760/3229] lr: 0.000008 grad_norm: 0.7409 (0.7656) closs: 1.0798 (1.0365) time: 4.0754 data: 0.0002 max mem: 54684
+[09:57:09.011822] Epoch: [2] [1770/3229] lr: 0.000008 grad_norm: 0.7625 (0.7657) closs: 1.0369 (1.0366) time: 4.1097 data: 0.0002 max mem: 54684
+[09:57:50.358048] Epoch: [2] [1780/3229] lr: 0.000008 grad_norm: 0.7780 (0.7660) closs: 1.0474 (1.0366) time: 4.1322 data: 0.0002 max mem: 54684
+[09:58:30.935480] Epoch: [2] [1790/3229] lr: 0.000008 grad_norm: 0.7607 (0.7658) closs: 1.0474 (1.0366) time: 4.0961 data: 0.0002 max mem: 54684
+[09:59:12.191909] Epoch: [2] [1800/3229] lr: 0.000008 grad_norm: 0.7704 (0.7660) closs: 1.0390 (1.0366) time: 4.0916 data: 0.0002 max mem: 54684
+[09:59:53.188925] Epoch: [2] [1810/3229] lr: 0.000008 grad_norm: 0.7880 (0.7661) closs: 1.0390 (1.0365) time: 4.1126 data: 0.0002 max mem: 54684
+[10:00:34.288625] Epoch: [2] [1820/3229] lr: 0.000008 grad_norm: 0.8177 (0.7663) closs: 1.0470 (1.0366) time: 4.1048 data: 0.0002 max mem: 54684
+[10:01:15.216189] Epoch: [2] [1830/3229] lr: 0.000008 grad_norm: 0.7464 (0.7662) closs: 1.0408 (1.0365) time: 4.1013 data: 0.0002 max mem: 54684
+[10:01:56.484523] Epoch: [2] [1840/3229] lr: 0.000008 grad_norm: 0.7634 (0.7664) closs: 1.0408 (1.0367) time: 4.1097 data: 0.0002 max mem: 54684
+[10:02:36.820581] Epoch: [2] [1850/3229] lr: 0.000008 grad_norm: 0.7761 (0.7663) closs: 1.0366 (1.0366) time: 4.0802 data: 0.0002 max mem: 54684
+[10:03:17.820103] Epoch: [2] [1860/3229] lr: 0.000007 grad_norm: 0.7713 (0.7665) closs: 1.0248 (1.0365) time: 4.0667 data: 0.0002 max mem: 54684
+[10:03:58.463854] Epoch: [2] [1870/3229] lr: 0.000007 grad_norm: 0.7713 (0.7666) closs: 1.0134 (1.0362) time: 4.0821 data: 0.0002 max mem: 54684
+[10:04:39.335047] Epoch: [2] [1880/3229] lr: 0.000007 grad_norm: 0.7584 (0.7665) closs: 1.0266 (1.0362) time: 4.0757 data: 0.0002 max mem: 54684
+[10:05:20.773148] Epoch: [2] [1890/3229] lr: 0.000007 grad_norm: 0.7505 (0.7666) closs: 1.0527 (1.0364) time: 4.1154 data: 0.0002 max mem: 54684
+[10:06:01.774211] Epoch: [2] [1900/3229] lr: 0.000007 grad_norm: 0.7830 (0.7667) closs: 1.0807 (1.0365) time: 4.1219 data: 0.0002 max mem: 54684
+[10:06:43.015666] Epoch: [2] [1910/3229] lr: 0.000007 grad_norm: 0.7959 (0.7668) closs: 1.0580 (1.0366) time: 4.1121 data: 0.0002 max mem: 54684
+[10:07:22.898606] Epoch: [2] [1920/3229] lr: 0.000007 grad_norm: 0.7359 (0.7665) closs: 1.0269 (1.0364) time: 4.0562 data: 0.0002 max mem: 54684
+[10:08:03.733126] Epoch: [2] [1930/3229] lr: 0.000007 grad_norm: 0.7359 (0.7667) closs: 1.0022 (1.0362) time: 4.0358 data: 0.0002 max mem: 54684
+[10:08:44.009814] Epoch: [2] [1940/3229] lr: 0.000007 grad_norm: 0.7834 (0.7665) closs: 1.0467 (1.0361) time: 4.0555 data: 0.0002 max mem: 54684
+[10:09:24.934480] Epoch: [2] [1950/3229] lr: 0.000007 grad_norm: 0.7408 (0.7665) closs: 1.0467 (1.0360) time: 4.0600 data: 0.0002 max mem: 54684
+[10:10:05.513930] Epoch: [2] [1960/3229] lr: 0.000007 grad_norm: 0.7564 (0.7664) closs: 1.0360 (1.0360) time: 4.0751 data: 0.0002 max mem: 54684
+[10:10:47.318139] Epoch: [2] [1970/3229] lr: 0.000007 grad_norm: 0.7470 (0.7664) closs: 1.0260 (1.0361) time: 4.1191 data: 0.0002 max mem: 54684
+[10:11:28.590511] Epoch: [2] [1980/3229] lr: 0.000007 grad_norm: 0.7601 (0.7667) closs: 1.0797 (1.0363) time: 4.1538 data: 0.0002 max mem: 54684
+[10:12:09.496980] Epoch: [2] [1990/3229] lr: 0.000007 grad_norm: 0.7790 (0.7667) closs: 1.0679 (1.0363) time: 4.1089 data: 0.0002 max mem: 54684
+[10:12:50.748606] Epoch: [2] [2000/3229] lr: 0.000007 grad_norm: 0.7970 (0.7670) closs: 1.0297 (1.0364) time: 4.1078 data: 0.0002 max mem: 54684
+[10:13:32.182612] Epoch: [2] [2010/3229] lr: 0.000007 grad_norm: 0.8054 (0.7671) closs: 1.0496 (1.0365) time: 4.1342 data: 0.0002 max mem: 54684
+[10:14:13.410630] Epoch: [2] [2020/3229] lr: 0.000007 grad_norm: 0.7780 (0.7672) closs: 1.0542 (1.0367) time: 4.1330 data: 0.0002 max mem: 54684
+[10:14:54.361523] Epoch: [2] [2030/3229] lr: 0.000007 grad_norm: 0.8040 (0.7677) closs: 1.0542 (1.0368) time: 4.1089 data: 0.0002 max mem: 54684
+[10:15:35.931648] Epoch: [2] [2040/3229] lr: 0.000007 grad_norm: 0.8409 (0.7681) closs: 1.0486 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684
+[10:16:16.746598] Epoch: [2] [2050/3229] lr: 0.000007 grad_norm: 0.7987 (0.7682) closs: 1.0486 (1.0368) time: 4.1192 data: 0.0002 max mem: 54684
+[10:16:58.012617] Epoch: [2] [2060/3229] lr: 0.000007 grad_norm: 0.8159 (0.7685) closs: 1.0473 (1.0369) time: 4.1040 data: 0.0002 max mem: 54684
+[10:17:38.676913] Epoch: [2] [2070/3229] lr: 0.000007 grad_norm: 0.7728 (0.7682) closs: 1.0073 (1.0366) time: 4.0965 data: 0.0002 max mem: 54684
+[10:18:20.035619] Epoch: [2] [2080/3229] lr: 0.000007 grad_norm: 0.7435 (0.7684) closs: 1.0386 (1.0368) time: 4.1011 data: 0.0002 max mem: 54684
+[10:19:01.109902] Epoch: [2] [2090/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0462 (1.0367) time: 4.1216 data: 0.0002 max mem: 54684
+[10:19:42.145840] Epoch: [2] [2100/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0461 (1.0368) time: 4.1054 data: 0.0002 max mem: 54684
+[10:20:21.912487] Epoch: [2] [2110/3229] lr: 0.000007 grad_norm: 0.7724 (0.7682) closs: 1.0394 (1.0367) time: 4.0401 data: 0.0002 max mem: 54684
+[10:21:02.609058] Epoch: [2] [2120/3229] lr: 0.000007 grad_norm: 0.7465 (0.7681) closs: 1.0394 (1.0367) time: 4.0231 data: 0.0002 max mem: 54684
+[10:21:43.356036] Epoch: [2] [2130/3229] lr: 0.000007 grad_norm: 0.7602 (0.7681) closs: 1.0324 (1.0366) time: 4.0721 data: 0.0002 max mem: 54684
+[10:22:24.153129] Epoch: [2] [2140/3229] lr: 0.000007 grad_norm: 0.7624 (0.7679) closs: 1.0118 (1.0365) time: 4.0771 data: 0.0002 max mem: 54684
+[10:23:04.987144] Epoch: [2] [2150/3229] lr: 0.000007 grad_norm: 0.7632 (0.7680) closs: 1.0690 (1.0368) time: 4.0815 data: 0.0002 max mem: 54684
+[10:23:46.301006] Epoch: [2] [2160/3229] lr: 0.000007 grad_norm: 0.7924 (0.7682) closs: 1.0757 (1.0370) time: 4.1073 data: 0.0002 max mem: 54684
+[10:24:27.430902] Epoch: [2] [2170/3229] lr: 0.000007 grad_norm: 0.7768 (0.7682) closs: 1.0597 (1.0370) time: 4.1221 data: 0.0002 max mem: 54684
+[10:25:08.536310] Epoch: [2] [2180/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0400 (1.0370) time: 4.1117 data: 0.0002 max mem: 54684
+[10:25:49.054789] Epoch: [2] [2190/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0429 (1.0370) time: 4.0811 data: 0.0002 max mem: 54684
+[10:26:28.974982] Epoch: [2] [2200/3229] lr: 0.000006 grad_norm: 0.7571 (0.7680) closs: 0.9973 (1.0367) time: 4.0219 data: 0.0002 max mem: 54684
+[10:27:10.250973] Epoch: [2] [2210/3229] lr: 0.000006 grad_norm: 0.7298 (0.7679) closs: 1.0033 (1.0367) time: 4.0597 data: 0.0002 max mem: 54684
+[10:27:51.496280] Epoch: [2] [2220/3229] lr: 0.000006 grad_norm: 0.7623 (0.7681) closs: 1.0618 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684
+[10:28:32.694107] Epoch: [2] [2230/3229] lr: 0.000006 grad_norm: 0.8018 (0.7682) closs: 1.1010 (1.0371) time: 4.1221 data: 0.0002 max mem: 54684
+[10:29:13.690626] Epoch: [2] [2240/3229] lr: 0.000006 grad_norm: 0.7677 (0.7682) closs: 1.0752 (1.0372) time: 4.1097 data: 0.0002 max mem: 54684
+[10:29:55.103277] Epoch: [2] [2250/3229] lr: 0.000006 grad_norm: 0.7532 (0.7683) closs: 1.1006 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684
+[10:30:36.017463] Epoch: [2] [2260/3229] lr: 0.000006 grad_norm: 0.7805 (0.7683) closs: 1.0913 (1.0374) time: 4.1163 data: 0.0002 max mem: 54684
+[10:31:17.232848] Epoch: [2] [2270/3229] lr: 0.000006 grad_norm: 0.7978 (0.7685) closs: 1.0719 (1.0376) time: 4.1064 data: 0.0002 max mem: 54684
+[10:31:57.554662] Epoch: [2] [2280/3229] lr: 0.000006 grad_norm: 0.7693 (0.7683) closs: 1.0401 (1.0375) time: 4.0768 data: 0.0002 max mem: 54684
+[10:32:38.982581] Epoch: [2] [2290/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0389 (1.0375) time: 4.0874 data: 0.0002 max mem: 54684
+[10:33:19.900400] Epoch: [2] [2300/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0546 (1.0376) time: 4.1172 data: 0.0002 max mem: 54684
+[10:34:00.422956] Epoch: [2] [2310/3229] lr: 0.000006 grad_norm: 0.7404 (0.7683) closs: 1.0432 (1.0375) time: 4.0720 data: 0.0002 max mem: 54684
+[10:34:41.838857] Epoch: [2] [2320/3229] lr: 0.000006 grad_norm: 0.7949 (0.7686) closs: 1.0168 (1.0374) time: 4.0969 data: 0.0002 max mem: 54684
+[10:35:22.205090] Epoch: [2] [2330/3229] lr: 0.000006 grad_norm: 0.7612 (0.7684) closs: 1.0087 (1.0372) time: 4.0890 data: 0.0002 max mem: 54684
+[10:36:03.158561] Epoch: [2] [2340/3229] lr: 0.000006 grad_norm: 0.7306 (0.7685) closs: 1.0268 (1.0373) time: 4.0659 data: 0.0002 max mem: 54684
+[10:36:44.005255] Epoch: [2] [2350/3229] lr: 0.000006 grad_norm: 0.7974 (0.7686) closs: 1.0339 (1.0373) time: 4.0899 data: 0.0002 max mem: 54684
+[10:37:24.060589] Epoch: [2] [2360/3229] lr: 0.000006 grad_norm: 0.7845 (0.7683) closs: 0.9994 (1.0371) time: 4.0450 data: 0.0002 max mem: 54684
+[10:38:04.953222] Epoch: [2] [2370/3229] lr: 0.000006 grad_norm: 0.7452 (0.7684) closs: 0.9928 (1.0371) time: 4.0473 data: 0.0002 max mem: 54684
+[10:38:45.575555] Epoch: [2] [2380/3229] lr: 0.000006 grad_norm: 0.7639 (0.7682) closs: 1.0037 (1.0371) time: 4.0757 data: 0.0002 max mem: 54684
+[10:39:26.779560] Epoch: [2] [2390/3229] lr: 0.000006 grad_norm: 0.7854 (0.7685) closs: 1.0731 (1.0372) time: 4.0913 data: 0.0002 max mem: 54684
+[10:40:07.277159] Epoch: [2] [2400/3229] lr: 0.000006 grad_norm: 0.7837 (0.7683) closs: 1.0746 (1.0371) time: 4.0850 data: 0.0002 max mem: 54684
+[10:40:47.656246] Epoch: [2] [2410/3229] lr: 0.000006 grad_norm: 0.7215 (0.7681) closs: 1.0545 (1.0370) time: 4.0438 data: 0.0002 max mem: 54684
+[10:41:28.261216] Epoch: [2] [2420/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0446 (1.0370) time: 4.0491 data: 0.0002 max mem: 54684
+[10:42:09.186454] Epoch: [2] [2430/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0480 (1.0370) time: 4.0764 data: 0.0002 max mem: 54684
+[10:42:50.581684] Epoch: [2] [2440/3229] lr: 0.000006 grad_norm: 0.7911 (0.7684) closs: 1.0551 (1.0370) time: 4.1160 data: 0.0002 max mem: 54684
+[10:43:31.660191] Epoch: [2] [2450/3229] lr: 0.000006 grad_norm: 0.7996 (0.7685) closs: 1.0322 (1.0370) time: 4.1236 data: 0.0002 max mem: 54684
+[10:44:12.906840] Epoch: [2] [2460/3229] lr: 0.000006 grad_norm: 0.7953 (0.7685) closs: 1.0322 (1.0370) time: 4.1162 data: 0.0002 max mem: 54684
+[10:44:54.081525] Epoch: [2] [2470/3229] lr: 0.000006 grad_norm: 0.7730 (0.7686) closs: 1.0670 (1.0371) time: 4.1210 data: 0.0002 max mem: 54684
+[10:45:35.195478] Epoch: [2] [2480/3229] lr: 0.000006 grad_norm: 0.7885 (0.7687) closs: 1.0536 (1.0371) time: 4.1144 data: 0.0002 max mem: 54684
+[10:46:16.528666] Epoch: [2] [2490/3229] lr: 0.000006 grad_norm: 0.7880 (0.7687) closs: 1.0536 (1.0373) time: 4.1223 data: 0.0002 max mem: 54684
+[10:46:56.834636] Epoch: [2] [2500/3229] lr: 0.000006 grad_norm: 0.7751 (0.7687) closs: 1.0654 (1.0372) time: 4.0819 data: 0.0002 max mem: 54684
+[10:47:38.101086] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.7705 (0.7688) closs: 1.0349 (1.0373) time: 4.0786 data: 0.0002 max mem: 54684
+[10:48:18.776789] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.7878 (0.7688) closs: 1.0416 (1.0373) time: 4.0970 data: 0.0002 max mem: 54684
+[10:48:59.471432] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.7899 (0.7688) closs: 1.0064 (1.0373) time: 4.0685 data: 0.0002 max mem: 54684
+[10:49:40.706118] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.7632 (0.7690) closs: 0.9999 (1.0372) time: 4.0964 data: 0.0002 max mem: 54684
+[10:50:21.879580] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.8102 (0.7692) closs: 1.0065 (1.0372) time: 4.1203 data: 0.0002 max mem: 54684
+[10:51:03.280777] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.7923 (0.7692) closs: 1.0337 (1.0373) time: 4.1287 data: 0.0002 max mem: 54684
+[10:51:44.513039] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.7990 (0.7697) closs: 1.0381 (1.0373) time: 4.1316 data: 0.0002 max mem: 54684
+[10:52:25.106616] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.8055 (0.7697) closs: 1.0194 (1.0372) time: 4.0912 data: 0.0002 max mem: 54684
+[10:53:05.456512] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.7706 (0.7696) closs: 1.0125 (1.0371) time: 4.0471 data: 0.0002 max mem: 54684
+[10:53:46.140129] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.7412 (0.7694) closs: 1.0323 (1.0371) time: 4.0516 data: 0.0002 max mem: 54684
+[10:54:27.649177] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.7625 (0.7696) closs: 1.0566 (1.0371) time: 4.1096 data: 0.0002 max mem: 54684
+[10:55:08.432178] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.8024 (0.7697) closs: 1.0176 (1.0371) time: 4.1145 data: 0.0002 max mem: 54684
+[10:55:49.037158] Epoch: [2] [2630/3229] lr: 0.000005 grad_norm: 0.7726 (0.7697) closs: 1.0176 (1.0369) time: 4.0693 data: 0.0002 max mem: 54684
+[10:56:30.399477] Epoch: [2] [2640/3229] lr: 0.000005 grad_norm: 0.7389 (0.7696) closs: 1.0279 (1.0371) time: 4.0983 data: 0.0002 max mem: 54684
+[10:57:11.914088] Epoch: [2] [2650/3229] lr: 0.000005 grad_norm: 0.7500 (0.7697) closs: 1.0760 (1.0372) time: 4.1438 data: 0.0002 max mem: 54684
+[10:57:53.051298] Epoch: [2] [2660/3229] lr: 0.000005 grad_norm: 0.7937 (0.7698) closs: 1.0704 (1.0373) time: 4.1325 data: 0.0002 max mem: 54684
+[10:58:34.322783] Epoch: [2] [2670/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0704 (1.0374) time: 4.1204 data: 0.0002 max mem: 54684
+[10:59:15.014004] Epoch: [2] [2680/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0494 (1.0374) time: 4.0981 data: 0.0002 max mem: 54684
+[10:59:55.439695] Epoch: [2] [2690/3229] lr: 0.000005 grad_norm: 0.7508 (0.7698) closs: 1.0343 (1.0373) time: 4.0558 data: 0.0002 max mem: 54684
+[11:00:36.264761] Epoch: [2] [2700/3229] lr: 0.000005 grad_norm: 0.7522 (0.7699) closs: 1.0379 (1.0374) time: 4.0625 data: 0.0002 max mem: 54684
+[11:01:17.279401] Epoch: [2] [2710/3229] lr: 0.000005 grad_norm: 0.7584 (0.7698) closs: 1.0813 (1.0375) time: 4.0919 data: 0.0002 max mem: 54684
+[11:01:58.168258] Epoch: [2] [2720/3229] lr: 0.000005 grad_norm: 0.7584 (0.7697) closs: 1.0343 (1.0375) time: 4.0951 data: 0.0002 max mem: 54684
+[11:02:39.456366] Epoch: [2] [2730/3229] lr: 0.000005 grad_norm: 0.7529 (0.7697) closs: 1.0343 (1.0375) time: 4.1088 data: 0.0002 max mem: 54684
+[11:03:20.315865] Epoch: [2] [2740/3229] lr: 0.000005 grad_norm: 0.7555 (0.7697) closs: 1.0462 (1.0375) time: 4.1073 data: 0.0002 max mem: 54684
+[11:04:02.063978] Epoch: [2] [2750/3229] lr: 0.000005 grad_norm: 0.7999 (0.7700) closs: 1.0785 (1.0377) time: 4.1303 data: 0.0002 max mem: 54684
+[11:04:42.801143] Epoch: [2] [2760/3229] lr: 0.000005 grad_norm: 0.7942 (0.7699) closs: 1.0842 (1.0378) time: 4.1242 data: 0.0002 max mem: 54684
+[11:05:23.770788] Epoch: [2] [2770/3229] lr: 0.000005 grad_norm: 0.7815 (0.7700) closs: 1.0259 (1.0377) time: 4.0853 data: 0.0002 max mem: 54684
+[11:06:04.580722] Epoch: [2] [2780/3229] lr: 0.000005 grad_norm: 0.7815 (0.7699) closs: 1.0562 (1.0378) time: 4.0889 data: 0.0002 max mem: 54684
+[11:06:45.682994] Epoch: [2] [2790/3229] lr: 0.000005 grad_norm: 0.7723 (0.7699) closs: 1.0843 (1.0379) time: 4.0955 data: 0.0002 max mem: 54684
+[11:07:26.664509] Epoch: [2] [2800/3229] lr: 0.000005 grad_norm: 0.7851 (0.7700) closs: 1.0863 (1.0381) time: 4.1041 data: 0.0002 max mem: 54684
+[11:08:07.660594] Epoch: [2] [2810/3229] lr: 0.000005 grad_norm: 0.7649 (0.7699) closs: 1.0497 (1.0380) time: 4.0988 data: 0.0002 max mem: 54684
+[11:08:48.441366] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.7366 (0.7698) closs: 1.0300 (1.0380) time: 4.0888 data: 0.0002 max mem: 54684
+[11:09:29.171313] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.7514 (0.7698) closs: 1.0300 (1.0378) time: 4.0755 data: 0.0002 max mem: 54684
+[11:10:09.985836] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.7665 (0.7697) closs: 0.9997 (1.0377) time: 4.0772 data: 0.0002 max mem: 54684
+[11:10:51.230047] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.7750 (0.7698) closs: 1.0333 (1.0378) time: 4.1029 data: 0.0002 max mem: 54684
+[11:11:31.685949] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.7660 (0.7697) closs: 1.0579 (1.0378) time: 4.0849 data: 0.0002 max mem: 54684
+[11:12:12.974896] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.7565 (0.7698) closs: 1.0466 (1.0378) time: 4.0872 data: 0.0002 max mem: 54684
+[11:12:54.373299] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.7817 (0.7699) closs: 1.0407 (1.0379) time: 4.1343 data: 0.0002 max mem: 54684
+[11:13:35.283572] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.7849 (0.7698) closs: 1.0329 (1.0377) time: 4.1154 data: 0.0002 max mem: 54684
+[11:14:16.118014] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.7950 (0.7700) closs: 1.0221 (1.0376) time: 4.0872 data: 0.0002 max mem: 54684
+[11:14:57.161039] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.8078 (0.7701) closs: 1.0416 (1.0376) time: 4.0938 data: 0.0002 max mem: 54684
+[11:15:38.521781] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.7894 (0.7701) closs: 1.0502 (1.0377) time: 4.1201 data: 0.0002 max mem: 54684
+[11:16:19.106288] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.7824 (0.7701) closs: 1.0595 (1.0377) time: 4.0972 data: 0.0002 max mem: 54684
+[11:17:00.252881] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.7871 (0.7701) closs: 1.0469 (1.0376) time: 4.0865 data: 0.0002 max mem: 54684
+[11:17:40.642013] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 1.0446 (1.0375) time: 4.0767 data: 0.0002 max mem: 54684
+[11:18:20.871868] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 0.9901 (1.0374) time: 4.0309 data: 0.0002 max mem: 54684
+[11:19:01.539668] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.8446 (0.7702) closs: 1.0398 (1.0373) time: 4.0448 data: 0.0002 max mem: 54684
+[11:19:42.663674] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.7707 (0.7702) closs: 1.0450 (1.0374) time: 4.0895 data: 0.0002 max mem: 54684
+[11:20:23.730795] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.7557 (0.7702) closs: 1.0745 (1.0375) time: 4.1095 data: 0.0002 max mem: 54684
+[11:21:04.740958] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.7990 (0.7704) closs: 1.0864 (1.0375) time: 4.1038 data: 0.0002 max mem: 54684
+[11:21:45.020285] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.8011 (0.7705) closs: 0.9966 (1.0374) time: 4.0644 data: 0.0002 max mem: 54684
+[11:22:25.881577] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.7592 (0.7705) closs: 1.0149 (1.0375) time: 4.0570 data: 0.0002 max mem: 54684
+[11:23:07.305216] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.7607 (0.7706) closs: 1.0597 (1.0376) time: 4.1142 data: 0.0002 max mem: 54684
+[11:23:48.348449] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.7679 (0.7705) closs: 1.0597 (1.0375) time: 4.1233 data: 0.0002 max mem: 54684
+[11:24:29.596151] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.7677 (0.7705) closs: 1.0172 (1.0375) time: 4.1145 data: 0.0002 max mem: 54684
+[11:25:10.442041] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.7748 (0.7706) closs: 1.0172 (1.0375) time: 4.1046 data: 0.0002 max mem: 54684
+[11:25:51.521912] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.8210 (0.7707) closs: 1.0409 (1.0374) time: 4.0962 data: 0.0002 max mem: 54684
+[11:26:32.713656] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.7547 (0.7706) closs: 1.0294 (1.0373) time: 4.1135 data: 0.0002 max mem: 54684
+[11:27:13.511213] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.7538 (0.7706) closs: 1.0294 (1.0374) time: 4.0994 data: 0.0002 max mem: 54684
+[11:27:54.680369] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.7631 (0.7707) closs: 1.0731 (1.0376) time: 4.0983 data: 0.0002 max mem: 54684
+[11:28:35.049341] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.7713 (0.7705) closs: 1.0357 (1.0373) time: 4.0768 data: 0.0002 max mem: 54684
+[11:29:15.418890] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.7078 (0.7705) closs: 1.0218 (1.0374) time: 4.0369 data: 0.0002 max mem: 54684
+[11:29:56.061537] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.7244 (0.7703) closs: 1.0440 (1.0374) time: 4.0505 data: 0.0002 max mem: 54684
+[11:30:36.293253] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.7669 (0.7705) closs: 1.0339 (1.0373) time: 4.0437 data: 0.0002 max mem: 54684
+[11:31:16.325807] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.7848 (0.7703) closs: 1.0084 (1.0372) time: 4.0131 data: 0.0004 max mem: 54684
+[11:31:57.799087] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.7618 (0.7704) closs: 1.0084 (1.0372) time: 4.0752 data: 0.0003 max mem: 54684
+[11:32:38.264034] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.7903 (0.7705) closs: 1.0556 (1.0371) time: 4.0968 data: 0.0002 max mem: 54684
+[11:33:18.885238] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.7486 (0.7705) closs: 1.0379 (1.0372) time: 4.0542 data: 0.0002 max mem: 54684
+[11:33:59.955564] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.7458 (0.7706) closs: 1.0379 (1.0372) time: 4.0845 data: 0.0002 max mem: 54684
+[11:34:41.295405] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.7953 (0.7707) closs: 1.0546 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684
+[11:35:22.419516] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.7931 (0.7707) closs: 1.0572 (1.0373) time: 4.1231 data: 0.0002 max mem: 54684
+[11:36:02.526362] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.7683 (0.7707) closs: 1.0178 (1.0372) time: 4.0615 data: 0.0001 max mem: 54684
+[11:36:35.973354] Epoch: [2] Total time: 3:40:17
+[11:36:35.974297] Averaged stats: lr: 0.000005 grad_norm: 0.7420 (0.7708) closs: 1.0624 (1.0381)
+[11:36:36.337101] model saved
+[11:36:38.023619] optimizer saved
+[11:36:38.024212] other rank-common saved
+[11:36:38.029179] rank-specific saved
+[11:36:38.029388] Training time 11:00:31
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..76baca48914c9ea8944b781bd72bdcc53122ffd6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0e65a2cbc92bd926b57acf3f986bccab80793cafe0e2e8b0f18566fafb58cc9
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..60fcd5447400048b70bd12781a7ac2ff3f591896
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e474f0071285386afbebdb6c6fb925be5ef3e9f3349a22c36fbffe48d77ea7c9
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75738418626b6e3173bda41a86ec5da7dc593ab8
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac78a8280454755c93a56b940b311201a0a8911b1b5f05c35d04486388b998fe
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..111c6c89cd3eab9b2e2e78b3c93b55e42b7179cc
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8fc318f143f893d3305b5abda9853ef6ba090d582b39122bfe0ef61ef2620d5
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b4645edcecd610180b6946730ba37462aaf675f3
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2fdf6406551f7380e09fda1858fac263c8d47f335664c8f1f26058d416bb70e
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..50bd574ffb5cad6148a8e51c9ab05e3d80027b40
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f168ff92d4b19bd4c714a344bfcdc36e1203ddcff5c9504a63acf3bdfb043b3
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d9e5d36c34d582a7b6e6b39a2d4a9186a4e338d4
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c07fd4364e9b806715c985cd4a4905c02bc2c67014227d4322effc78e87d42f4
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..92472b6308ca7910f824edde08c94c6fc4473912
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b3ffb824593175223cc1cc760088a45e55bb33fcdd62e37e42249c03c9c9d36
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d31ff745058632b1ad1e193fae6bdeb5d0a8a172
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d63f55f7f82591356144cd8ce6e7acba5e908efe4759350a3af15f31d4015567
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9dfdab397433ffafcd9aaaa95d61dea3ffa4e9b --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4092488fa46d10b7694923c74532faf866919d6c39c9a6d52bc4ab1e505bcb45 +size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..42bf93bb5ab71a68c015eabd2d345ca309073368 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8abcacc8b53a2d27740c8543bb5adcb88e2ab89e9aa5cd403e30ed2d9563dc1c +size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c07662de2e6d6147924de4c5427234211f7d032 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27afc0d923f1973f23206d7bbc02b246e988a3fe94eebdedc4e14b31fa3b5801 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..63c991be766a6e7b09f62eb27253c47ba558cf0e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 1.03953114467595, "train_closs": 0.8988287092961849, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 1.03953114467595, "val_closs": 0.8988287092961849} 
+{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.9151975991837497, "train_closs": 0.854513919164468, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.9151975991837497, "val_closs": 0.854513919164468} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.8807328767670787, "train_closs": 0.8423879001418064, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.8807328767670787, "val_closs": 0.8423879001418064} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.9092244758394551, "train_closs": 0.8364003172804937, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.9092244758394551, "val_closs": 0.8364003172804937} diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3ce76611f6b1af197c27e3fedae6374f65ada327 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log @@ -0,0 +1,648 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 2): env://, gpu 2 +[05:56:44.962935] > initializing model parallel with size 1 +[05:56:44.963012] > initializing ddp with size 8 +[05:56:44.963019] > initializing pipeline with size 1 +[05:56:45.119503] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[05:56:45.119584] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-13b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[05:56:45.120384] Start initialization. +[05:56:45.120416] ## Processing on RANK 0. +[05:56:45.129261] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) +[05:58:19.701205] Model is Peft: True +[05:58:19.709591] Trainable parameter count : 65131520 (local rank), 65131520 (all). +[05:58:19.822258] ## Load pretrained from ../checkpoints/llama2/Llama-2-13b/ +[05:58:52.688570] ## Quantizing model to 4bit! 
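
Note that every .pth entry in this diff is a Git LFS pointer rather than the tensor payload itself: the tracked file holds exactly three key-value lines (version, oid sha256, size in bytes), and the real checkpoint is fetched by git lfs pull. Below is a minimal Python sketch for inspecting such a pointer without fetching it; the helper name read_lfs_pointer is illustrative and not part of the repository.

from pathlib import Path

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into a dict of its three fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        if line.strip():
            key, _, value = line.partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])  # byte size of the real payload
    return fields

info = read_lfs_pointer(
    "finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/"
    "epoch1/consolidated.00-of-01.model.pth"
)
print(info["oid"], info["size"])  # sha256:d8fc318f..., 90930987
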
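The per-epoch records in log.txt above are JSON-lines: one object per epoch with learning rate, gradient norm, and causal loss under train_*/val_* keys. A short sketch for loading them (field names are taken from the records themselves; the function name is illustrative):

import json

def load_epoch_metrics(path):
    """Read a JSON-lines training log into a list of per-epoch dicts."""
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

for rec in load_epoch_metrics(
        "finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt"):
    print(f"epoch {rec['epoch']}: train_closs={rec['train_closs']:.4f} "
          f"lr={rec['train_lr']:.2e}")

# From the four records above, train_closs decreases monotonically:
# 0.8988 -> 0.8545 -> 0.8424 -> 0.8364 across epochs 0-3.
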
+ Qunatization Process: 0%| | 0/1047 [00:00 +[06:27:21.584239] Start training for 4 epochs +[06:27:21.595470] log_dir: ./output_dir +[06:27:27.291530] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.9510 (1.9510) closs: 1.0616 (1.0616) time: 5.6952 data: 1.5912 max mem: 18825 +[06:27:48.233874] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 2.1544 (2.1439) closs: 1.0616 (1.0547) time: 2.4215 data: 0.1448 max mem: 28042 +[06:28:09.492825] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 2.0091 (2.0561) closs: 0.9999 (1.0350) time: 2.1100 data: 0.0002 max mem: 28042 +[06:28:30.750007] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.9742 (2.0773) closs: 1.0489 (1.0550) time: 2.1257 data: 0.0002 max mem: 28042 +[06:28:51.915590] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.9236 (2.0170) closs: 1.0628 (1.0608) time: 2.1210 data: 0.0002 max mem: 28042 +[06:29:13.261311] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.6794 (1.9642) closs: 1.0594 (1.0599) time: 2.1255 data: 0.0002 max mem: 28042 +[06:29:34.413120] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.5823 (1.8870) closs: 1.0342 (1.0539) time: 2.1248 data: 0.0002 max mem: 28042 +[06:29:55.626442] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.4000 (1.8062) closs: 1.0269 (1.0558) time: 2.1181 data: 0.0002 max mem: 28042 +[06:30:16.918661] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 1.2024 (1.7370) closs: 1.0211 (1.0531) time: 2.1252 data: 0.0002 max mem: 28042 +[06:30:38.196705] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 1.1287 (1.6695) closs: 0.9826 (1.0414) time: 2.1284 data: 0.0002 max mem: 28042 +[06:30:59.467468] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 1.0506 (1.6085) closs: 0.9512 (1.0363) time: 2.1274 data: 0.0002 max mem: 28042 +[06:31:20.727971] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.9873 (1.5522) closs: 0.9416 (1.0241) time: 2.1265 data: 0.0002 max mem: 28042 +[06:31:41.936817] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.9233 (1.5061) closs: 0.9447 (1.0219) time: 2.1234 data: 0.0002 max mem: 28042 +[06:32:03.041717] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.9836 (1.4640) closs: 0.9609 (1.0167) time: 2.1156 data: 0.0002 max mem: 28042 +[06:32:24.295834] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.9415 (1.4258) closs: 0.8978 (1.0079) time: 2.1179 data: 0.0002 max mem: 28042 +[06:32:45.543342] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.8967 (1.3994) closs: 0.8940 (1.0025) time: 2.1250 data: 0.0002 max mem: 28042 +[06:33:06.837739] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.9221 (1.3720) closs: 0.9130 (0.9991) time: 2.1270 data: 0.0002 max mem: 28042 +[06:33:28.086917] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.9860 (1.3484) closs: 0.9068 (0.9924) time: 2.1271 data: 0.0002 max mem: 28042 +[06:33:49.368902] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.9860 (1.3266) closs: 0.8552 (0.9843) time: 2.1265 data: 0.0002 max mem: 28042 +[06:34:10.555964] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.9269 (1.3048) closs: 0.8552 (0.9786) time: 2.1234 data: 0.0002 max mem: 28042 +[06:34:31.919181] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.9269 (1.2870) closs: 0.9133 (0.9764) time: 2.1274 data: 0.0002 max mem: 28042 +[06:34:53.191153] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.9518 (1.2708) closs: 0.9159 (0.9724) time: 2.1317 data: 0.0002 max mem: 28042 +[06:35:14.531503] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.9473 (1.2543) closs: 0.9002 (0.9697) time: 2.1305 data: 0.0002 max mem: 28042 +[06:35:35.775538] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.8726 (1.2400) closs: 0.8788 (0.9638) time: 2.1291 data: 
0.0002 max mem: 28042 +[06:35:57.064040] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.8857 (1.2265) closs: 0.8438 (0.9603) time: 2.1266 data: 0.0002 max mem: 28042 +[06:36:18.204677] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.8589 (1.2119) closs: 0.9044 (0.9582) time: 2.1214 data: 0.0002 max mem: 28042 +[06:36:39.553053] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.8594 (1.2025) closs: 0.8971 (0.9549) time: 2.1244 data: 0.0002 max mem: 28042 +[06:37:00.855213] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.9139 (1.1965) closs: 0.8980 (0.9543) time: 2.1324 data: 0.0002 max mem: 28042 +[06:37:22.140492] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.9011 (1.1889) closs: 0.9115 (0.9515) time: 2.1293 data: 0.0002 max mem: 28042 +[06:37:43.447171] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.9554 (1.1825) closs: 0.8680 (0.9484) time: 2.1295 data: 0.0002 max mem: 28042 +[06:38:04.736791] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.9554 (1.1737) closs: 0.8583 (0.9459) time: 2.1297 data: 0.0002 max mem: 28042 +[06:38:25.924120] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.8799 (1.1661) closs: 0.8818 (0.9452) time: 2.1238 data: 0.0002 max mem: 28042 +[06:38:47.257933] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.9149 (1.1595) closs: 0.9034 (0.9449) time: 2.1260 data: 0.0002 max mem: 28042 +[06:39:08.559953] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.9245 (1.1522) closs: 0.8678 (0.9414) time: 2.1317 data: 0.0002 max mem: 28042 +[06:39:29.852361] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.9209 (1.1468) closs: 0.8413 (0.9386) time: 2.1296 data: 0.0002 max mem: 28042 +[06:39:51.145032] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.9571 (1.1421) closs: 0.8830 (0.9398) time: 2.1292 data: 0.0002 max mem: 28042 +[06:40:12.364246] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.9706 (1.1378) closs: 0.8992 (0.9382) time: 2.1255 data: 0.0002 max mem: 28042 +[06:40:33.511071] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.8997 (1.1309) closs: 0.8396 (0.9361) time: 2.1182 data: 0.0002 max mem: 28042 +[06:40:54.788872] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.8940 (1.1266) closs: 0.8361 (0.9348) time: 2.1212 data: 0.0002 max mem: 28042 +[06:41:16.033192] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.9331 (1.1217) closs: 0.8695 (0.9335) time: 2.1260 data: 0.0002 max mem: 28042 +[06:41:37.222343] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.9583 (1.1184) closs: 0.8604 (0.9323) time: 2.1216 data: 0.0002 max mem: 28042 +[06:41:58.490518] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.8561 (1.1128) closs: 0.8604 (0.9309) time: 2.1228 data: 0.0002 max mem: 28042 +[06:42:19.739979] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.8646 (1.1088) closs: 0.8581 (0.9291) time: 2.1258 data: 0.0002 max mem: 28042 +[06:42:40.820073] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.8915 (1.1065) closs: 0.8364 (0.9265) time: 2.1164 data: 0.0002 max mem: 28042 +[06:43:02.062800] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.8808 (1.1014) closs: 0.7983 (0.9241) time: 2.1161 data: 0.0002 max mem: 28042 +[06:43:23.326377] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.8808 (1.0998) closs: 0.8111 (0.9218) time: 2.1252 data: 0.0002 max mem: 28042 +[06:43:44.565401] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.8849 (1.0957) closs: 0.8498 (0.9205) time: 2.1250 data: 0.0002 max mem: 28042 +[06:44:05.812308] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.8793 (1.0957) closs: 0.8498 (0.9200) time: 2.1242 data: 0.0002 max mem: 28042 +[06:44:27.069455] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.9377 
(1.0929) closs: 0.9008 (0.9193) time: 2.1251 data: 0.0002 max mem: 28042 +[06:44:48.177755] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.9086 (1.0893) closs: 0.8924 (0.9187) time: 2.1182 data: 0.0002 max mem: 28042 +[06:45:09.416351] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.8987 (1.0874) closs: 0.8698 (0.9174) time: 2.1173 data: 0.0002 max mem: 28042 +[06:45:30.630868] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.8835 (1.0869) closs: 0.8536 (0.9153) time: 2.1226 data: 0.0002 max mem: 28042 +[06:45:51.904027] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.9220 (1.0837) closs: 0.7934 (0.9142) time: 2.1243 data: 0.0002 max mem: 28042 +[06:46:13.161036] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.9652 (1.0826) closs: 0.8786 (0.9135) time: 2.1264 data: 0.0002 max mem: 28042 +[06:46:34.432979] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.9935 (1.0810) closs: 0.9075 (0.9130) time: 2.1264 data: 0.0002 max mem: 28042 +[06:46:55.524047] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.9487 (1.0826) closs: 0.8710 (0.9118) time: 2.1181 data: 0.0002 max mem: 28042 +[06:47:16.807565] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.9651 (1.0816) closs: 0.8440 (0.9113) time: 2.1186 data: 0.0002 max mem: 28042 +[06:47:38.094895] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.9088 (1.0788) closs: 0.8497 (0.9105) time: 2.1285 data: 0.0002 max mem: 28042 +[06:47:59.329657] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.9026 (1.0786) closs: 0.8642 (0.9103) time: 2.1260 data: 0.0002 max mem: 28042 +[06:48:20.589047] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.9026 (1.0764) closs: 0.8642 (0.9095) time: 2.1246 data: 0.0002 max mem: 28042 +[06:48:41.877780] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.8705 (1.0731) closs: 0.8507 (0.9088) time: 2.1273 data: 0.0002 max mem: 28042 +[06:49:03.085078] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.8442 (1.0703) closs: 0.8255 (0.9068) time: 2.1247 data: 0.0002 max mem: 28042 +[06:49:24.290564] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.8969 (1.0716) closs: 0.7882 (0.9062) time: 2.1206 data: 0.0002 max mem: 28042 +[06:49:45.503873] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.9548 (1.0711) closs: 0.8524 (0.9050) time: 2.1209 data: 0.0002 max mem: 28042 +[06:50:06.759717] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.9548 (1.0689) closs: 0.8578 (0.9048) time: 2.1234 data: 0.0002 max mem: 28042 +[06:50:28.061959] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.8640 (1.0660) closs: 0.8476 (0.9035) time: 2.1278 data: 0.0002 max mem: 28042 +[06:50:49.325746] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.8640 (1.0636) closs: 0.8116 (0.9030) time: 2.1282 data: 0.0002 max mem: 28042 +[06:51:10.485211] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.8712 (1.0605) closs: 0.8405 (0.9025) time: 2.1211 data: 0.0002 max mem: 28042 +[06:51:31.762440] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.8712 (1.0583) closs: 0.8536 (0.9021) time: 2.1218 data: 0.0002 max mem: 28042 +[06:51:53.108436] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.8425 (1.0562) closs: 0.8619 (0.9016) time: 2.1311 data: 0.0002 max mem: 28042 +[06:52:14.345086] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.8661 (1.0545) closs: 0.8556 (0.9012) time: 2.1291 data: 0.0002 max mem: 28042 +[06:52:35.562580] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.9346 (1.0539) closs: 0.8271 (0.9002) time: 2.1226 data: 0.0002 max mem: 28042 +[06:52:56.788771] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.9284 (1.0529) closs: 0.8355 (0.8998) time: 2.1221 data: 0.0002 max mem: 28042 +[06:53:17.889187] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.9407 (1.0518) closs: 0.8588 (0.8990) time: 2.1163 data: 0.0002 max mem: 28042 +[06:53:39.137183] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.9572 (1.0510) closs: 0.8287 (0.8978) time: 2.1173 data: 0.0002 max mem: 28042 +[06:54:00.392567] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.8869 (1.0493) closs: 0.8349 (0.8985) time: 2.1251 data: 0.0002 max mem: 28042 +[06:54:21.698073] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.8591 (1.0473) closs: 0.8724 (0.8979) time: 2.1280 data: 0.0002 max mem: 28042 +[06:54:42.998694] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.8692 (1.0454) closs: 0.8389 (0.8970) time: 2.1302 data: 0.0002 max mem: 28042 +[06:55:04.236494] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.9315 (1.0442) closs: 0.8393 (0.8966) time: 2.1268 data: 0.0002 max mem: 28042 +[06:55:25.352416] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.8830 (1.0420) closs: 0.8393 (0.8957) time: 2.1176 data: 0.0001 max mem: 28042 +[06:55:46.583325] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.8569 (1.0408) closs: 0.8439 (0.8953) time: 2.1173 data: 0.0001 max mem: 28042 +[06:56:07.855749] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8229 (0.8941) time: 2.1251 data: 0.0001 max mem: 28042 +[06:56:10.209403] Epoch: [0] Total time: 0:28:48 +[06:56:10.212512] Averaged stats: lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8177 (0.8988) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[06:56:10.615675] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[06:56:12.374769] optimizer saved +[06:56:12.375482] other rank-common saved +[06:56:12.381202] rank-specific saved +[06:56:12.391881] log_dir: ./output_dir +[06:56:15.877787] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.7418 (0.7418) closs: 0.6434 (0.6434) time: 3.4849 data: 1.3481 max mem: 28042 +[06:56:37.172127] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.9125 (0.9429) closs: 0.8020 (0.7882) time: 2.2526 data: 0.1227 max mem: 28042 +[06:56:58.553564] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.9125 (0.9737) closs: 0.8134 (0.8498) time: 2.1337 data: 0.0002 max mem: 28042 +[06:57:19.841517] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.8671 (0.9519) closs: 0.8293 (0.8457) time: 2.1334 data: 0.0002 max mem: 28042 +[06:57:41.138608] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.8634 (0.9318) closs: 0.8733 (0.8656) time: 2.1292 data: 0.0002 max mem: 28042 +[06:58:02.227816] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.8460 (0.9328) closs: 0.8800 (0.8613) time: 2.1192 data: 0.0002 max mem: 28042 +[06:58:23.466948] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.8895 (0.9279) closs: 0.8352 (0.8591) time: 2.1163 data: 0.0002 max mem: 28042 +[06:58:44.680097] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.9388 (0.9401) closs: 0.8626 (0.8619) time: 2.1225 data: 0.0002 max mem: 28042 +[06:59:05.983269] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.9289 (0.9347) closs: 0.8646 (0.8577) time: 2.1257 data: 0.0002 max mem: 28042 +[06:59:27.380557] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.8478 (0.9265) closs: 0.8646 (0.8618) time: 2.1349 data: 0.0002 max mem: 28042 +[06:59:48.640207] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.8774 (0.9255) closs: 0.8923 (0.8640) time: 2.1328 data: 0.0002 max mem: 28042 +[07:00:09.758341] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.8900 (0.9251) closs: 0.8698 (0.8612) time: 2.1188 data: 0.0002 max mem: 28042 +[07:00:30.981964] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.9182 (0.9269) closs: 0.8698 (0.8617) time: 2.1170 data: 0.0002 max mem: 28042 +[07:00:52.245385] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.9182 (0.9259) closs: 0.8665 (0.8617) time: 2.1243 data: 0.0002 max mem: 28042 +[07:01:13.558548] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.8672 (0.9211) closs: 0.8531 (0.8614) time: 2.1287 data: 0.0002 max mem: 28042 +[07:01:34.846520] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.8460 (0.9249) closs: 0.8726 (0.8611) time: 2.1300 data: 0.0002 max mem: 28042 +[07:01:56.152223] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.9007 (0.9226) closs: 0.8726 (0.8635) time: 2.1296 data: 0.0002 max mem: 28042 +[07:02:17.282750] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.8338 (0.9195) 
closs: 0.9167 (0.8660) time: 2.1217 data: 0.0002 max mem: 28042 +[07:02:38.524204] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.8527 (0.9223) closs: 0.8779 (0.8662) time: 2.1185 data: 0.0002 max mem: 28042 +[07:02:59.827288] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.8880 (0.9228) closs: 0.8079 (0.8646) time: 2.1271 data: 0.0002 max mem: 28042 +[07:03:21.168075] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.8588 (0.9203) closs: 0.8388 (0.8627) time: 2.1321 data: 0.0002 max mem: 28042 +[07:03:42.467849] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.8416 (0.9177) closs: 0.8623 (0.8648) time: 2.1319 data: 0.0002 max mem: 28042 +[07:04:03.731453] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.8283 (0.9154) closs: 0.8735 (0.8637) time: 2.1281 data: 0.0002 max mem: 28042 +[07:04:24.911070] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.8495 (0.9127) closs: 0.8099 (0.8620) time: 2.1221 data: 0.0002 max mem: 28042 +[07:04:46.205620] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.8495 (0.9115) closs: 0.8409 (0.8631) time: 2.1236 data: 0.0002 max mem: 28042 +[07:05:07.516767] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.8600 (0.9131) closs: 0.8848 (0.8622) time: 2.1302 data: 0.0002 max mem: 28042 +[07:05:28.819912] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.8600 (0.9108) closs: 0.8859 (0.8627) time: 2.1306 data: 0.0002 max mem: 28042 +[07:05:50.052732] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.8403 (0.9102) closs: 0.8677 (0.8615) time: 2.1267 data: 0.0002 max mem: 28042 +[07:06:11.298621] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.9042 (0.9123) closs: 0.8677 (0.8617) time: 2.1239 data: 0.0002 max mem: 28042 +[07:06:32.419845] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.9813 (0.9163) closs: 0.8537 (0.8620) time: 2.1183 data: 0.0002 max mem: 28042 +[07:06:53.663078] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.8997 (0.9148) closs: 0.8405 (0.8609) time: 2.1181 data: 0.0002 max mem: 28042 +[07:07:14.922464] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.8997 (0.9155) closs: 0.8405 (0.8602) time: 2.1251 data: 0.0002 max mem: 28042 +[07:07:36.151134] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.8916 (0.9157) closs: 0.8480 (0.8609) time: 2.1243 data: 0.0002 max mem: 28042 +[07:07:57.365292] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.8389 (0.9149) closs: 0.8297 (0.8603) time: 2.1221 data: 0.0002 max mem: 28042 +[07:08:18.608023] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.8646 (0.9134) closs: 0.7981 (0.8584) time: 2.1228 data: 0.0002 max mem: 28042 +[07:08:39.766957] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.8646 (0.9123) closs: 0.7981 (0.8579) time: 2.1200 data: 0.0002 max mem: 28042 +[07:09:01.054096] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.8671 (0.9124) closs: 0.7807 (0.8556) time: 2.1222 data: 0.0002 max mem: 28042 +[07:09:22.343988] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.9250 (0.9165) closs: 0.7688 (0.8546) time: 2.1288 data: 0.0002 max mem: 28042 +[07:09:43.668019] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.9275 (0.9156) closs: 0.8322 (0.8548) time: 2.1306 data: 0.0003 max mem: 28042 +[07:10:04.936534] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.8581 (0.9324) closs: 0.8617 (0.8550) time: 2.1296 data: 0.0003 max mem: 28042 +[07:10:26.269641] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.8561 (0.9311) closs: 0.8443 (0.8548) time: 2.1300 data: 0.0002 max mem: 28042 +[07:10:47.398303] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.8857 (0.9329) closs: 0.8655 (0.8563) time: 2.1230 data: 0.0002 max mem: 28042 +[07:11:08.680671] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.9295 (0.9312) closs: 0.9093 (0.8567) time: 2.1205 data: 0.0002 max mem: 28042 +[07:11:29.894053] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.8829 (0.9323) closs: 0.8460 (0.8558) time: 2.1247 data: 0.0002 max mem: 28042 +[07:11:51.120078] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.8876 (0.9315) closs: 0.8460 (0.8558) time: 2.1219 data: 0.0002 max mem: 28042 +[07:12:12.348027] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.8868 (0.9304) closs: 0.8481 (0.8554) time: 2.1226 data: 0.0002 max mem: 28042 +[07:12:33.569320] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.9059 (0.9318) closs: 0.8101 (0.8548) time: 2.1224 data: 0.0002 max mem: 28042 +[07:12:54.684647] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.9218 (0.9311) closs: 0.8178 (0.8549) time: 2.1168 data: 0.0002 max mem: 28042 +[07:13:15.972916] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.8772 (0.9306) closs: 0.8478 (0.8555) time: 2.1201 data: 0.0002 max mem: 28042 +[07:13:37.223460] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.8772 (0.9295) closs: 0.8584 (0.8561) time: 2.1269 data: 0.0002 max mem: 28042 +[07:13:58.457753] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.8557 (0.9276) closs: 0.8375 (0.8556) time: 2.1242 data: 0.0002 max mem: 28042 +[07:14:19.699206] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.7918 (0.9274) closs: 0.8128 (0.8553) time: 2.1237 data: 0.0002 max mem: 28042 +[07:14:41.023843] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.8657 (0.9260) closs: 0.8301 (0.8552) time: 2.1282 data: 0.0002 max mem: 28042 +[07:15:02.146542] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.8365 (0.9253) closs: 0.8398 (0.8553) time: 2.1223 data: 0.0003 max mem: 28042 +[07:15:23.393129] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.8390 (0.9242) closs: 0.8310 (0.8553) time: 2.1184 data: 0.0003 max mem: 28042 +[07:15:44.662588] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.8286 (0.9227) closs: 0.8088 (0.8546) time: 2.1257 data: 0.0002 max mem: 28042 +[07:16:05.961040] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.8162 (0.9208) closs: 0.8267 (0.8545) time: 2.1283 data: 0.0002 max mem: 28042 +[07:16:27.247815] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.8102 (0.9196) closs: 0.8561 (0.8549) time: 2.1292 data: 0.0002 max mem: 28042 +[07:16:48.505828] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.8414 (0.9193) closs: 0.8574 (0.8542) time: 2.1272 data: 0.0002 max mem: 28042 +[07:17:09.646613] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.9224 (0.9192) closs: 0.8602 (0.8545) time: 2.1199 data: 0.0002 max mem: 28042 +[07:17:30.846001] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.9177 (0.9188) closs: 0.8599 (0.8548) time: 2.1169 data: 0.0002 max mem: 28042 +[07:17:52.084387] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.8469 (0.9183) closs: 0.8411 (0.8548) time: 2.1218 data: 0.0002 max mem: 28042 +[07:18:13.385659] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.8469 (0.9170) closs: 0.8457 (0.8548) time: 2.1269 data: 0.0002 max mem: 28042 +[07:18:34.632870] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.8292 (0.9164) closs: 0.8625 (0.8553) time: 2.1273 data: 0.0002 max mem: 28042 +[07:18:55.965669] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.8478 (0.9155) closs: 0.8567 (0.8553) time: 2.1289 data: 0.0002 max mem: 28042 +[07:19:17.086640] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.8298 (0.9147) closs: 0.8326 (0.8550) time: 2.1226 data: 0.0004 max mem: 28042 +[07:19:38.331115] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.8200 (0.9143) closs: 0.8794 (0.8560) time: 2.1182 data: 0.0004 
max mem: 28042 +[07:19:59.582006] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.8277 (0.9136) closs: 0.9154 (0.8567) time: 2.1247 data: 0.0002 max mem: 28042 +[07:20:20.825085] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.8543 (0.9137) closs: 0.9115 (0.8573) time: 2.1246 data: 0.0002 max mem: 28042 +[07:20:42.055735] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.9597 (0.9188) closs: 0.9179 (0.8578) time: 2.1236 data: 0.0003 max mem: 28042 +[07:21:03.290761] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.8645 (0.9179) closs: 0.8496 (0.8577) time: 2.1232 data: 0.0002 max mem: 28042 +[07:21:24.434877] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.8440 (0.9172) closs: 0.8131 (0.8574) time: 2.1189 data: 0.0002 max mem: 28042 +[07:21:45.655921] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.8724 (0.9173) closs: 0.7997 (0.8569) time: 2.1182 data: 0.0002 max mem: 28042 +[07:22:06.935228] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.8932 (0.9170) closs: 0.8529 (0.8571) time: 2.1249 data: 0.0002 max mem: 28042 +[07:22:28.172880] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.8932 (0.9174) closs: 0.8546 (0.8569) time: 2.1258 data: 0.0002 max mem: 28042 +[07:22:49.386611] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.8588 (0.9167) closs: 0.8256 (0.8568) time: 2.1225 data: 0.0002 max mem: 28042 +[07:23:10.577933] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.8566 (0.9159) closs: 0.8375 (0.8571) time: 2.1202 data: 0.0002 max mem: 28042 +[07:23:31.706111] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.8276 (0.9146) closs: 0.8624 (0.8566) time: 2.1159 data: 0.0002 max mem: 28042 +[07:23:53.046796] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.8238 (0.9141) closs: 0.8051 (0.8564) time: 2.1234 data: 0.0002 max mem: 28042 +[07:24:14.279679] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.8885 (0.9144) closs: 0.8512 (0.8566) time: 2.1286 data: 0.0002 max mem: 28042 +[07:24:35.475030] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.8738 (0.9139) closs: 0.8451 (0.8562) time: 2.1213 data: 0.0002 max mem: 28042 +[07:24:56.734344] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.8738 (0.9154) closs: 0.8451 (0.8562) time: 2.1227 data: 0.0002 max mem: 28042 +[07:24:59.165687] Epoch: [1] Total time: 0:28:46 +[07:24:59.168439] Averaged stats: lr: 0.000039 grad_norm: 0.8738 (0.9152) closs: 0.8673 (0.8545) +[07:24:59.501598] model saved +[07:25:01.228520] optimizer saved +[07:25:01.229115] other rank-common saved +[07:25:01.232745] rank-specific saved +[07:25:01.242649] log_dir: ./output_dir +[07:25:04.605755] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.8173 (0.8173) closs: 0.8037 (0.8037) time: 3.3621 data: 1.2087 max mem: 28042 +[07:25:25.938470] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.7978 (0.8008) closs: 0.8347 (0.8409) time: 2.2449 data: 0.1101 max mem: 28042 +[07:25:47.222423] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.8004 (0.8113) closs: 0.8347 (0.8317) time: 2.1308 data: 0.0002 max mem: 28042 +[07:26:08.393469] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.8724 (0.8343) closs: 0.8318 (0.8414) time: 2.1227 data: 0.0002 max mem: 28042 +[07:26:29.541814] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.8632 (0.8308) closs: 0.8318 (0.8407) time: 2.1159 data: 0.0002 max mem: 28042 +[07:26:50.705595] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.8126 (0.8463) closs: 0.8506 (0.8366) time: 2.1155 data: 0.0002 max mem: 28042 +[07:27:11.930514] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.8214 (0.8468) closs: 0.8533 (0.8395) time: 2.1194 data: 0.0002 max mem: 28042 +[07:27:33.224038] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.8380 (0.8558) closs: 0.8522 (0.8441) time: 2.1258 data: 0.0002 max mem: 28042 +[07:27:54.451669] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.8503 (0.8651) closs: 0.8207 (0.8425) time: 2.1260 data: 0.0002 max mem: 28042 +[07:28:15.680781] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.8135 (0.8623) closs: 0.8026 (0.8396) time: 2.1228 data: 0.0002 max mem: 28042 +[07:28:37.066597] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.8527 (0.8702) closs: 0.8127 (0.8409) time: 2.1307 data: 0.0002 max mem: 28042 +[07:28:58.274244] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.8377 (0.8658) closs: 0.8242 (0.8359) time: 2.1296 data: 0.0002 max mem: 28042 +[07:29:19.632451] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.8040 (0.8644) closs: 0.8393 (0.8376) time: 2.1282 data: 0.0002 max mem: 28042 +[07:29:41.020148] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.8352 (0.8648) closs: 0.8316 (0.8367) time: 2.1372 data: 0.0002 max mem: 28042 +[07:30:02.395455] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.8786 (0.8695) closs: 0.8135 (0.8362) time: 2.1381 data: 0.0002 max mem: 28042 +[07:30:23.808228] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.8822 (0.8688) closs: 0.8453 (0.8374) time: 2.1393 data: 0.0002 max mem: 28042 +[07:30:45.164764] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.8822 (0.8746) closs: 0.8842 (0.8382) time: 2.1384 data: 0.0002 max mem: 28042 +[07:31:06.315458] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.8529 (0.8701) closs: 0.8118 (0.8371) time: 2.1253 data: 0.0002 max mem: 28042 +[07:31:27.656439] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.8679 (0.8743) closs: 0.8118 (0.8369) time: 2.1245 data: 0.0002 max mem: 28042 +[07:31:49.023759] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.8803 (0.8730) closs: 0.8327 (0.8381) time: 2.1353 data: 0.0002 max mem: 28042 +[07:32:10.391465] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.8540 (0.8725) closs: 0.8327 (0.8392) time: 2.1367 data: 0.0002 max mem: 28042 +[07:32:31.801438] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.8493 (0.8724) closs: 0.8227 (0.8396) time: 2.1388 data: 0.0002 max mem: 28042 +[07:32:53.101914] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.8557 (0.8723) closs: 0.8429 (0.8411) time: 2.1354 data: 0.0003 max mem: 28042 +[07:33:14.125610] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.8792 (0.8748) closs: 0.8429 (0.8417) time: 2.1161 data: 0.0003 max mem: 28042 +[07:33:35.442382] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.8701 (0.8736) closs: 0.8309 (0.8416) time: 2.1169 data: 0.0002 max mem: 28042 +[07:33:56.828157] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.8521 (0.8742) closs: 0.8317 (0.8422) time: 2.1351 data: 0.0002 max mem: 28042 +[07:34:18.204872] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.8521 (0.8736) closs: 0.7921 (0.8415) time: 2.1380 data: 0.0002 max mem: 28042 +[07:34:39.611093] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.7877 (0.8718) closs: 0.7921 (0.8428) time: 2.1391 data: 0.0002 max mem: 28042 +[07:35:00.999360] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.8320 (0.8718) closs: 0.8277 (0.8442) time: 2.1396 data: 0.0002 max mem: 28042 +[07:35:22.112658] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.8661 (0.8723) closs: 0.8074 (0.8410) time: 2.1250 data: 0.0002 max mem: 28042 +[07:35:43.467764] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.8647 (0.8730) closs: 0.8054 (0.8405) time: 2.1233 data: 0.0002 max mem: 28042 +[07:36:04.824780] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.8826 (0.8761) closs: 0.8440 (0.8406) time: 2.1355 data: 0.0002 max mem: 28042 
+[07:36:26.170930] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.8778 (0.8759) closs: 0.8422 (0.8418) time: 2.1351 data: 0.0002 max mem: 28042 +[07:36:47.504093] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.8626 (0.8788) closs: 0.8727 (0.8434) time: 2.1339 data: 0.0002 max mem: 28042 +[07:37:08.880216] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.8626 (0.8780) closs: 0.8866 (0.8442) time: 2.1354 data: 0.0002 max mem: 28042 +[07:37:30.026320] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.8242 (0.8770) closs: 0.8389 (0.8438) time: 2.1260 data: 0.0002 max mem: 28042 +[07:37:51.342100] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.8242 (0.8763) closs: 0.8495 (0.8446) time: 2.1230 data: 0.0002 max mem: 28042 +[07:38:12.672288] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.8642 (0.8774) closs: 0.8439 (0.8445) time: 2.1322 data: 0.0002 max mem: 28042 +[07:38:34.038903] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.8655 (0.8792) closs: 0.8376 (0.8437) time: 2.1348 data: 0.0002 max mem: 28042 +[07:38:55.416450] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.8655 (0.8799) closs: 0.8200 (0.8424) time: 2.1371 data: 0.0002 max mem: 28042 +[07:39:16.805648] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.9245 (0.8820) closs: 0.8217 (0.8425) time: 2.1383 data: 0.0002 max mem: 28042 +[07:39:37.947526] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.8268 (0.8809) closs: 0.8217 (0.8423) time: 2.1265 data: 0.0002 max mem: 28042 +[07:39:59.300724] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.7987 (0.8815) closs: 0.8043 (0.8423) time: 2.1247 data: 0.0004 max mem: 28042 +[07:40:20.670550] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.8388 (0.8824) closs: 0.8112 (0.8429) time: 2.1361 data: 0.0004 max mem: 28042 +[07:40:42.075112] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.8653 (0.8825) closs: 0.8614 (0.8427) time: 2.1386 data: 0.0002 max mem: 28042 +[07:41:03.398003] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.8892 (0.8833) closs: 0.8563 (0.8438) time: 2.1363 data: 0.0002 max mem: 28042 +[07:41:24.767838] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.8508 (0.8832) closs: 0.9083 (0.8458) time: 2.1346 data: 0.0002 max mem: 28042 +[07:41:45.899249] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.8296 (0.8824) closs: 0.8864 (0.8453) time: 2.1250 data: 0.0002 max mem: 28042 +[07:42:07.234898] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.8476 (0.8834) closs: 0.8653 (0.8465) time: 2.1233 data: 0.0002 max mem: 28042 +[07:42:28.526171] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.8476 (0.8827) closs: 0.8379 (0.8457) time: 2.1313 data: 0.0002 max mem: 28042 +[07:42:49.834652] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.8437 (0.8834) closs: 0.8202 (0.8453) time: 2.1299 data: 0.0002 max mem: 28042 +[07:43:11.253722] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.8629 (0.8832) closs: 0.8411 (0.8454) time: 2.1363 data: 0.0002 max mem: 28042 +[07:43:32.569350] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.8694 (0.8837) closs: 0.8411 (0.8454) time: 2.1366 data: 0.0002 max mem: 28042 +[07:43:53.723009] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.8694 (0.8828) closs: 0.8377 (0.8456) time: 2.1234 data: 0.0002 max mem: 28042 +[07:44:15.068725] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.8512 (0.8832) closs: 0.8377 (0.8454) time: 2.1249 data: 0.0002 max mem: 28042 +[07:44:36.418823] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.8541 (0.8821) closs: 0.8485 (0.8453) time: 2.1347 data: 0.0002 max mem: 28042 +[07:44:57.735533] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.7920 (0.8821) closs: 0.8278 
(0.8453) time: 2.1333 data: 0.0002 max mem: 28042 +[07:45:19.044480] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.8350 (0.8831) closs: 0.8197 (0.8447) time: 2.1312 data: 0.0002 max mem: 28042 +[07:45:40.336507] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.9544 (0.8832) closs: 0.8045 (0.8436) time: 2.1300 data: 0.0002 max mem: 28042 +[07:46:01.445076] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.8539 (0.8832) closs: 0.8222 (0.8426) time: 2.1200 data: 0.0002 max mem: 28042 +[07:46:22.758448] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.8545 (0.8826) closs: 0.8357 (0.8426) time: 2.1210 data: 0.0002 max mem: 28042 +[07:46:43.975732] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.8283 (0.8824) closs: 0.8244 (0.8424) time: 2.1265 data: 0.0002 max mem: 28042 +[07:47:05.349197] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.8456 (0.8825) closs: 0.8244 (0.8423) time: 2.1295 data: 0.0002 max mem: 28042 +[07:47:26.735767] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.8456 (0.8814) closs: 0.8121 (0.8423) time: 2.1379 data: 0.0002 max mem: 28042 +[07:47:48.056890] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.8785 (0.8820) closs: 0.8153 (0.8423) time: 2.1353 data: 0.0002 max mem: 28042 +[07:48:09.177512] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.8796 (0.8835) closs: 0.7759 (0.8408) time: 2.1220 data: 0.0002 max mem: 28042 +[07:48:30.537949] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.8254 (0.8828) closs: 0.7714 (0.8401) time: 2.1240 data: 0.0002 max mem: 28042 +[07:48:51.921824] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.8632 (0.8835) closs: 0.7948 (0.8396) time: 2.1371 data: 0.0002 max mem: 28042 +[07:49:13.251435] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.8754 (0.8834) closs: 0.8484 (0.8402) time: 2.1356 data: 0.0002 max mem: 28042 +[07:49:34.579325] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.8018 (0.8829) closs: 0.8826 (0.8405) time: 2.1328 data: 0.0002 max mem: 28042 +[07:49:55.977559] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.8168 (0.8827) closs: 0.8405 (0.8403) time: 2.1362 data: 0.0002 max mem: 28042 +[07:50:17.140172] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.8241 (0.8818) closs: 0.7813 (0.8396) time: 2.1280 data: 0.0002 max mem: 28042 +[07:50:38.489617] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.7747 (0.8815) closs: 0.7818 (0.8397) time: 2.1255 data: 0.0002 max mem: 28042 +[07:50:59.854958] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.8660 (0.8817) closs: 0.8288 (0.8405) time: 2.1357 data: 0.0002 max mem: 28042 +[07:51:21.193447] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.8745 (0.8813) closs: 0.8010 (0.8404) time: 2.1351 data: 0.0002 max mem: 28042 +[07:51:42.504050] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.8117 (0.8810) closs: 0.8010 (0.8404) time: 2.1324 data: 0.0002 max mem: 28042 +[07:52:03.789461] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.8396 (0.8808) closs: 0.8350 (0.8399) time: 2.1297 data: 0.0002 max mem: 28042 +[07:52:24.867624] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.8502 (0.8811) closs: 0.8475 (0.8405) time: 2.1181 data: 0.0002 max mem: 28042 +[07:52:46.212501] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.8794 (0.8809) closs: 0.8875 (0.8410) time: 2.1211 data: 0.0002 max mem: 28042 +[07:53:07.550117] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.7999 (0.8805) closs: 0.8674 (0.8414) time: 2.1341 data: 0.0001 max mem: 28042 +[07:53:28.932999] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.8289 (0.8808) closs: 0.8283 (0.8416) time: 2.1359 data: 0.0001 max mem: 28042 +[07:53:50.278871] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.8302 (0.8809) closs: 0.8504 (0.8421) time: 2.1364 data: 0.0001 max mem: 28042 +[07:53:52.714588] Epoch: [2] Total time: 0:28:51 +[07:53:52.730137] Averaged stats: lr: 0.000016 grad_norm: 0.8289 (0.8807) closs: 0.8552 (0.8424) +[07:53:53.203661] model saved +[07:53:54.901871] optimizer saved +[07:53:54.902645] other rank-common saved +[07:53:54.906903] rank-specific saved +[07:53:54.917520] log_dir: ./output_dir +[07:53:58.191492] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 1.0490 (1.0490) closs: 0.6441 (0.6441) time: 3.2729 data: 1.1646 max mem: 28042 +[07:54:19.479554] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.8902 (0.9635) closs: 0.9038 (0.8853) time: 2.2327 data: 0.1061 max mem: 28042 +[07:54:40.626378] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.8645 (0.9616) closs: 0.8700 (0.8582) time: 2.1217 data: 0.0002 max mem: 28042 +[07:55:01.731665] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.8400 (0.9256) closs: 0.8400 (0.8562) time: 2.1125 data: 0.0002 max mem: 28042 +[07:55:22.885001] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.8370 (0.9113) closs: 0.8383 (0.8448) time: 2.1129 data: 0.0002 max mem: 28042 +[07:55:44.040122] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.8304 (0.8985) closs: 0.8214 (0.8448) time: 2.1153 data: 0.0002 max mem: 28042 +[07:56:05.256778] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.8304 (0.9031) closs: 0.8439 (0.8496) time: 2.1185 data: 0.0002 max mem: 28042 +[07:56:26.408434] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.8670 (0.9090) closs: 0.8136 (0.8462) time: 2.1183 data: 0.0002 max mem: 28042 +[07:56:47.651858] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.8294 (0.9039) closs: 0.8295 (0.8458) time: 2.1197 data: 0.0002 max mem: 28042 +[07:57:08.789407] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.8278 (0.9153) closs: 0.8492 (0.8447) time: 2.1190 data: 0.0002 max mem: 28042 +[07:57:29.972419] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.8774 (0.9115) closs: 0.8209 (0.8421) time: 2.1159 data: 0.0002 max mem: 28042 +[07:57:51.294111] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.8553 (0.9095) closs: 0.7868 (0.8363) time: 2.1252 data: 0.0002 max mem: 28042 +[07:58:12.671601] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.8556 (0.9050) closs: 0.7868 (0.8408) time: 2.1349 data: 0.0003 max mem: 28042 +[07:58:33.964903] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.8585 (0.9045) closs: 0.8151 (0.8379) time: 2.1335 data: 0.0003 max mem: 28042 +[07:58:55.294678] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.8542 (0.9024) closs: 0.8471 (0.8414) time: 2.1311 data: 0.0002 max mem: 28042 +[07:59:16.596797] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.8433 (0.9080) closs: 0.8486 (0.8407) time: 2.1315 data: 0.0002 max mem: 28042 +[07:59:37.813975] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.8266 (0.9031) closs: 0.7934 (0.8408) time: 2.1259 data: 0.0002 max mem: 28042 +[07:59:59.101368] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.8266 (0.9000) closs: 0.8247 (0.8395) time: 2.1252 data: 0.0002 max mem: 28042 +[08:00:20.346641] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.8709 (0.9001) closs: 0.8101 (0.8379) time: 2.1266 data: 0.0002 max mem: 28042 +[08:00:41.591960] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.9020 (0.9010) closs: 0.8010 (0.8377) time: 2.1245 data: 0.0002 max mem: 28042 +[08:01:02.880467] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.8726 (0.8965) closs: 0.8493 (0.8391) time: 2.1266 data: 0.0002 max mem: 28042 +[08:01:24.071233] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.7843 (0.8941) closs: 0.8166 (0.8381) 
time: 2.1239 data: 0.0002 max mem: 28042 +[08:01:45.236262] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.8370 (0.8924) closs: 0.8164 (0.8396) time: 2.1177 data: 0.0002 max mem: 28042 +[08:02:06.489504] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.8645 (0.8946) closs: 0.8308 (0.8402) time: 2.1208 data: 0.0002 max mem: 28042 +[08:02:27.784203] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.8777 (0.8973) closs: 0.8308 (0.8405) time: 2.1273 data: 0.0002 max mem: 28042 +[08:02:49.078010] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.8627 (0.8957) closs: 0.8112 (0.8399) time: 2.1293 data: 0.0002 max mem: 28042 +[08:03:10.423309] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.7976 (0.8924) closs: 0.7705 (0.8383) time: 2.1319 data: 0.0002 max mem: 28042 +[08:03:31.704948] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.8373 (0.8915) closs: 0.7730 (0.8388) time: 2.1313 data: 0.0002 max mem: 28042 +[08:03:52.854770] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.8283 (0.8906) closs: 0.7952 (0.8401) time: 2.1215 data: 0.0002 max mem: 28042 +[08:04:14.113366] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.8470 (0.8935) closs: 0.7852 (0.8392) time: 2.1203 data: 0.0002 max mem: 28042 +[08:04:35.342838] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.9499 (0.8946) closs: 0.7852 (0.8366) time: 2.1243 data: 0.0003 max mem: 28042 +[08:04:56.618987] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.9211 (0.8958) closs: 0.8185 (0.8384) time: 2.1252 data: 0.0003 max mem: 28042 +[08:05:17.910768] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.8922 (0.8954) closs: 0.8641 (0.8389) time: 2.1283 data: 0.0003 max mem: 28042 +[08:05:39.221330] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.8283 (0.8942) closs: 0.8408 (0.8387) time: 2.1300 data: 0.0002 max mem: 28042 +[08:06:00.377654] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.8666 (0.8945) closs: 0.8742 (0.8393) time: 2.1233 data: 0.0002 max mem: 28042 +[08:06:21.664630] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.8586 (0.8937) closs: 0.8742 (0.8399) time: 2.1221 data: 0.0002 max mem: 28042 +[08:06:42.936458] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.8542 (0.8931) closs: 0.8490 (0.8404) time: 2.1279 data: 0.0002 max mem: 28042 +[08:07:04.181192] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.8234 (0.8933) closs: 0.8577 (0.8406) time: 2.1257 data: 0.0002 max mem: 28042 +[08:07:25.478197] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.8222 (0.8926) closs: 0.8238 (0.8396) time: 2.1270 data: 0.0002 max mem: 28042 +[08:07:46.790628] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.8357 (0.8927) closs: 0.7723 (0.8386) time: 2.1304 data: 0.0002 max mem: 28042 +[08:08:07.856380] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.8908 (0.8945) closs: 0.8394 (0.8390) time: 2.1188 data: 0.0002 max mem: 28042 +[08:08:29.129808] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.8908 (0.8952) closs: 0.8187 (0.8376) time: 2.1169 data: 0.0002 max mem: 28042 +[08:08:50.389711] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.8459 (0.8954) closs: 0.7789 (0.8370) time: 2.1266 data: 0.0002 max mem: 28042 +[08:09:11.665127] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.8785 (0.8963) closs: 0.7809 (0.8356) time: 2.1267 data: 0.0002 max mem: 28042 +[08:09:32.923390] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.8872 (0.8970) closs: 0.7809 (0.8346) time: 2.1266 data: 0.0002 max mem: 28042 +[08:09:54.151497] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.8938 (0.8975) closs: 0.7968 (0.8343) time: 2.1242 data: 0.0002 max mem: 28042 +[08:10:15.305060] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.8952 (0.9105) closs: 0.8061 (0.8337) time: 2.1190 data: 0.0002 max mem: 28042 +[08:10:36.576963] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.8742 (0.9107) closs: 0.7998 (0.8334) time: 2.1212 data: 0.0002 max mem: 28042 +[08:10:57.844902] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.8667 (0.9107) closs: 0.7924 (0.8329) time: 2.1269 data: 0.0002 max mem: 28042 +[08:11:19.115588] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.8655 (0.9100) closs: 0.7802 (0.8330) time: 2.1269 data: 0.0002 max mem: 28042 +[08:11:40.448298] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.8168 (0.9101) closs: 0.8021 (0.8326) time: 2.1301 data: 0.0005 max mem: 28042 +[08:12:01.736491] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.8157 (0.9084) closs: 0.8268 (0.8339) time: 2.1310 data: 0.0004 max mem: 28042 +[08:12:22.852744] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.8698 (0.9092) closs: 0.9299 (0.8346) time: 2.1201 data: 0.0002 max mem: 28042 +[08:12:44.116451] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.8853 (0.9092) closs: 0.7947 (0.8340) time: 2.1189 data: 0.0002 max mem: 28042 +[08:13:05.415617] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.8769 (0.9088) closs: 0.7747 (0.8337) time: 2.1281 data: 0.0002 max mem: 28042 +[08:13:26.702689] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.8467 (0.9087) closs: 0.8074 (0.8329) time: 2.1292 data: 0.0002 max mem: 28042 +[08:13:47.965371] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.8314 (0.9073) closs: 0.8062 (0.8323) time: 2.1274 data: 0.0002 max mem: 28042 +[08:14:09.281838] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.8763 (0.9081) closs: 0.8191 (0.8329) time: 2.1289 data: 0.0002 max mem: 28042 +[08:14:30.423561] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.9098 (0.9078) closs: 0.8414 (0.8333) time: 2.1228 data: 0.0002 max mem: 28042 +[08:14:51.732903] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.8236 (0.9058) closs: 0.8080 (0.8328) time: 2.1225 data: 0.0002 max mem: 28042 +[08:15:13.032032] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.8071 (0.9055) closs: 0.7610 (0.8326) time: 2.1303 data: 0.0002 max mem: 28042 +[08:15:34.338684] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.8387 (0.9068) closs: 0.8489 (0.8333) time: 2.1302 data: 0.0002 max mem: 28042 +[08:15:55.610535] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.8382 (0.9067) closs: 0.8438 (0.8335) time: 2.1289 data: 0.0002 max mem: 28042 +[08:16:16.889308] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.8850 (0.9083) closs: 0.7955 (0.8327) time: 2.1275 data: 0.0002 max mem: 28042 +[08:16:38.059748] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.8985 (0.9078) closs: 0.7955 (0.8326) time: 2.1224 data: 0.0002 max mem: 28042 +[08:16:59.340361] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.8872 (0.9074) closs: 0.8339 (0.8337) time: 2.1225 data: 0.0002 max mem: 28042 +[08:17:20.632265] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.8939 (0.9068) closs: 0.8938 (0.8344) time: 2.1286 data: 0.0002 max mem: 28042 +[08:17:41.842573] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.8939 (0.9067) closs: 0.8269 (0.8338) time: 2.1250 data: 0.0002 max mem: 28042 +[08:18:03.079831] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.8363 (0.9068) closs: 0.8251 (0.8344) time: 2.1223 data: 0.0002 max mem: 28042 +[08:18:24.338829] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.8847 (0.9081) closs: 0.8305 (0.8341) time: 2.1247 data: 0.0002 max mem: 28042 +[08:18:45.483996] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.8847 (0.9070) closs: 0.8262 (0.8343) time: 2.1201 data: 0.0002 max mem: 28042 
+[08:19:06.675603] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.8213 (0.9068) closs: 0.8601 (0.8347) time: 2.1168 data: 0.0002 max mem: 28042 +[08:19:27.902064] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.8174 (0.9057) closs: 0.8440 (0.8350) time: 2.1208 data: 0.0002 max mem: 28042 +[08:19:49.131969] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.8260 (0.9056) closs: 0.8061 (0.8343) time: 2.1227 data: 0.0002 max mem: 28042 +[08:20:10.427471] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.8716 (0.9058) closs: 0.8154 (0.8347) time: 2.1262 data: 0.0002 max mem: 28042 +[08:20:31.811740] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.8902 (0.9072) closs: 0.8183 (0.8343) time: 2.1339 data: 0.0002 max mem: 28042 +[08:20:52.966245] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.8902 (0.9094) closs: 0.8189 (0.8348) time: 2.1269 data: 0.0002 max mem: 28042 +[08:21:14.257344] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.8943 (0.9086) closs: 0.8266 (0.8348) time: 2.1222 data: 0.0002 max mem: 28042 +[08:21:35.557339] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.8404 (0.9086) closs: 0.8143 (0.8347) time: 2.1295 data: 0.0002 max mem: 28042 +[08:21:56.831315] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.8876 (0.9092) closs: 0.8193 (0.8347) time: 2.1286 data: 0.0002 max mem: 28042 +[08:22:18.140914] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.8628 (0.9097) closs: 0.8552 (0.8350) time: 2.1291 data: 0.0002 max mem: 28042 +[08:22:39.368185] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.8627 (0.9094) closs: 0.9012 (0.8357) time: 2.1268 data: 0.0002 max mem: 28042 +[08:22:41.821347] Epoch: [3] Total time: 0:28:46 +[08:22:41.825202] Averaged stats: lr: 0.000005 grad_norm: 0.8627 (0.9092) closs: 0.9007 (0.8364) +[08:22:42.226562] model saved +[08:22:43.924087] optimizer saved +[08:22:43.925003] other rank-common saved +[08:22:43.931834] rank-specific saved +[08:22:43.932142] Training time 1:55:22 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f050e55b07facfaec1e5a03de296bde911239050 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5c15a2808412d27835f14c0fc05ef1fc310923302af1fb9c02c445a59b7304 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..6aa599035681e620d1e8181b9ffd84a7bf151194 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f565b9183e251c048fae68754e0b1a015c651fd43b0cd71a3982f9b4d1077dd +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..e317414ddacd9d596ca17754b1fb347e8c620137 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:11ad37201918bfc6a749b9c14caf28e560ac54b89a2d0fa3be30e50c49fab9cd +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..331afc6d009c4c4bdce415e1e05fe0a7c6fd04f8 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c997936cd82af0a6012259bafc3d2e9e2a6c0acd90f8762ec953c18c052c8c4 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f5c4bf62d29d14a7652cf8b73c31183e5b6b245 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586da76697344d1409f6d0331176114e72fd47b98091fd6c258a4792eea01a82 +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c36cacd3ff477892660b4bdf6361cc4ab5bf40a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d32e2b269680e43657c7c735c0be6f8fd7672e839232b8381fccdf09d36792 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth 
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4a9509a579916c44cb1ef97e02d070092a0fe9a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b7acb10eef238d30c123f47b15af4c9c443ef414ab5f8737674883bf3d5189 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e77163e87c30cafa46def5bdb669dc533a6005 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e071734f60c90e438181c8ae0d2dd00a0cf7081ac674445bbeec548315a98a +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..e415f2ef15b0b4d08b397865c99738db07bf0a07 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da9395be70ad1f7631e92aec6d4efc745e19ca78afde1004e4193976aea056d +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ 
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e71af1b7c315b37a4aea851461f8dcefbd6e27d7 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:71aece4919e4237bb2920f0dfd74e23ac51a04fd74a56ab8efcab06962a6d3cd +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..2da15d2c95c60871c58625ee883ee9fff7857cc6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8c23cd50d34ab5d423af68a8e4a2fa3ba11191f6cbd5146638cc478d9f3bb6 +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9125500778ab300ca76c74d5da10a548c4985af --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f931fea34d05be0494c0ab9718bce37ad922d058b64b820c329cd71d918f3a70 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff 
--git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..641deafc203ae506ccbc5ac592b83c4532ba0041 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.6733022602687916, "train_closs": 0.9640464621499695, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.6733022602687916, "val_closs": 0.9640464621499695} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.5561584743299508, "train_closs": 0.9136834735638035, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.5561584743299508, "val_closs": 0.9136834735638035} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.5519019591764276, "train_closs": 0.9010383364437102, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.5519019591764276, "val_closs": 0.9010383364437102} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.5554111024796082, "train_closs": 0.8950173630897561, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.5554111024796082, "val_closs": 0.8950173630897561} diff --git 
a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9d6d14604302702cadfc48b3ace8125e28a27c43 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log @@ -0,0 +1,648 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 5): env://, gpu 5 +[04:34:34.123451] > initializing model parallel with size 1 +[04:34:34.123679] > initializing ddp with size 8 +[04:34:34.123687] > initializing pipeline with size 1 +[04:34:34.273785] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[04:34:34.273908] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-7b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-7b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-7b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[04:34:34.276112] Start initialization. +[04:34:34.276172] ## Processing on RANK 0. +[04:34:34.285967] Model Args: + ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) +[04:35:23.173165] Model is Peft: True +[04:35:23.179946] Trainable parameter count : 41603072 (local rank), 41603072 (all). +[04:35:23.288029] ## Load pretrained from ../checkpoints/llama2/Llama-2-7b/ +[04:35:40.542782] ## Quantizing model to 4bit! 
+ Qunatization Process: 0%| | 0/839 [00:00 +[04:49:56.134049] Start training for 4 epochs +[04:49:56.141961] log_dir: ./output_dir +[04:50:00.993524] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.6694 (1.6694) closs: 1.1100 (1.1100) time: 4.8504 data: 1.6775 max mem: 11767 +[04:50:13.467172] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.7142 (1.6843) closs: 1.1138 (1.1233) time: 1.5749 data: 0.1527 max mem: 17666 +[04:50:25.645879] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.6067 (1.6129) closs: 1.0717 (1.1035) time: 1.2325 data: 0.0002 max mem: 17666 +[04:50:37.815063] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.6067 (1.6224) closs: 1.1138 (1.1220) time: 1.2173 data: 0.0002 max mem: 17666 +[04:50:50.097974] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.5736 (1.5833) closs: 1.1291 (1.1294) time: 1.2225 data: 0.0001 max mem: 17666 +[04:51:02.303652] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.3909 (1.5353) closs: 1.1291 (1.1312) time: 1.2244 data: 0.0002 max mem: 17666 +[04:51:14.652585] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.2569 (1.4798) closs: 1.1374 (1.1263) time: 1.2277 data: 0.0002 max mem: 17666 +[04:51:26.911672] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.1196 (1.4223) closs: 1.0760 (1.1289) time: 1.2303 data: 0.0002 max mem: 17666 +[04:51:39.110388] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 0.9717 (1.3646) closs: 1.0850 (1.1260) time: 1.2228 data: 0.0002 max mem: 17666 +[04:51:51.365652] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.8888 (1.3056) closs: 1.0433 (1.1139) time: 1.2226 data: 0.0002 max mem: 17666 +[04:52:03.587350] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7781 (1.2548) closs: 1.0227 (1.1095) time: 1.2238 data: 0.0002 max mem: 17666 +[04:52:15.847329] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.7166 (1.2090) closs: 1.0220 (1.0972) time: 1.2240 data: 0.0002 max mem: 17666 +[04:52:28.053228] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.7216 (1.1685) closs: 1.0220 (1.0963) time: 1.2232 data: 0.0002 max mem: 17666 +[04:52:40.306358] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.7216 (1.1348) closs: 1.0442 (1.0918) time: 1.2229 data: 0.0002 max mem: 17666 +[04:52:52.564879] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.6922 (1.1040) closs: 0.9522 (1.0824) time: 1.2255 data: 0.0002 max mem: 17666 +[04:53:04.777107] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.6689 (1.0747) closs: 0.9576 (1.0768) time: 1.2235 data: 0.0002 max mem: 17666 +[04:53:17.031527] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.6162 (1.0461) closs: 0.9987 (1.0734) time: 1.2233 data: 0.0002 max mem: 17666 +[04:53:29.214924] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.6162 (1.0222) closs: 0.9767 (1.0659) time: 1.2218 data: 0.0002 max mem: 17666 +[04:53:41.435558] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.6274 (0.9998) closs: 0.9133 (1.0574) time: 1.2201 data: 0.0002 max mem: 17666 +[04:53:53.641153] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.5874 (0.9789) closs: 0.9174 (1.0514) time: 1.2212 data: 0.0002 max mem: 17666 +[04:54:05.868672] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.5874 (0.9605) closs: 0.9700 (1.0485) time: 1.2216 data: 0.0002 max mem: 17666 +[04:54:18.107142] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.5872 (0.9425) closs: 0.9847 (1.0443) time: 1.2232 data: 0.0002 max mem: 17666 +[04:54:30.330679] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.5699 (0.9259) closs: 0.9661 (1.0414) time: 1.2230 data: 0.0001 max mem: 17666 +[04:54:42.605734] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.5536 (0.9112) closs: 0.9191 (1.0351) time: 1.2249 data: 
0.0001 max mem: 17666 +[04:54:54.840846] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.5524 (0.8976) closs: 0.9078 (1.0315) time: 1.2254 data: 0.0002 max mem: 17666 +[04:55:07.114906] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.5982 (0.8887) closs: 0.9491 (1.0288) time: 1.2254 data: 0.0002 max mem: 17666 +[04:55:19.357896] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.5592 (0.8752) closs: 0.9431 (1.0253) time: 1.2258 data: 0.0002 max mem: 17666 +[04:55:31.788387] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.5482 (0.8643) closs: 0.9608 (1.0244) time: 1.2336 data: 0.0002 max mem: 17666 +[04:55:44.053664] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.5581 (0.8548) closs: 0.9608 (1.0214) time: 1.2347 data: 0.0001 max mem: 17666 +[04:55:56.289359] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.5606 (0.8459) closs: 0.9531 (1.0183) time: 1.2250 data: 0.0001 max mem: 17666 +[04:56:08.574032] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.5730 (0.8372) closs: 0.9457 (1.0158) time: 1.2260 data: 0.0001 max mem: 17666 +[04:56:20.819959] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.5730 (0.8287) closs: 0.9613 (1.0152) time: 1.2265 data: 0.0002 max mem: 17666 +[04:56:33.077045] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.5720 (0.8213) closs: 0.9613 (1.0149) time: 1.2251 data: 0.0002 max mem: 17666 +[04:56:45.304449] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.5427 (0.8132) closs: 0.9252 (1.0109) time: 1.2241 data: 0.0002 max mem: 17666 +[04:56:57.563745] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.5427 (0.8059) closs: 0.9062 (1.0078) time: 1.2242 data: 0.0002 max mem: 17666 +[04:57:09.823556] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.5474 (0.7994) closs: 0.9406 (1.0088) time: 1.2259 data: 0.0002 max mem: 17666 +[04:57:22.044133] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.5634 (0.7934) closs: 0.9744 (1.0071) time: 1.2240 data: 0.0002 max mem: 17666 +[04:57:34.303848] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.5715 (0.7872) closs: 0.9075 (1.0046) time: 1.2239 data: 0.0002 max mem: 17666 +[04:57:46.512890] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.5607 (0.7815) closs: 0.9075 (1.0032) time: 1.2234 data: 0.0002 max mem: 17666 +[04:57:58.775654] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.5731 (0.7765) closs: 0.9216 (1.0017) time: 1.2235 data: 0.0002 max mem: 17666 +[04:58:11.005032] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.5846 (0.7721) closs: 0.9061 (1.0003) time: 1.2245 data: 0.0002 max mem: 17666 +[04:58:23.252387] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.5909 (0.7688) closs: 0.9044 (0.9990) time: 1.2238 data: 0.0002 max mem: 17666 +[04:58:35.540811] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.5704 (0.7638) closs: 0.8909 (0.9968) time: 1.2267 data: 0.0002 max mem: 17666 +[04:58:47.754683] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.5641 (0.7602) closs: 0.8689 (0.9940) time: 1.2250 data: 0.0002 max mem: 17666 +[04:59:00.034244] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.5588 (0.7557) closs: 0.8496 (0.9914) time: 1.2246 data: 0.0002 max mem: 17666 +[04:59:12.243723] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.5588 (0.7518) closs: 0.8680 (0.9887) time: 1.2244 data: 0.0002 max mem: 17666 +[04:59:24.510184] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.5679 (0.7476) closs: 0.8964 (0.9874) time: 1.2237 data: 0.0002 max mem: 17666 +[04:59:36.736038] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.5676 (0.7444) closs: 0.9041 (0.9869) time: 1.2245 data: 0.0002 max mem: 17666 +[04:59:48.991058] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.5570 
(0.7411) closs: 0.9610 (0.9865) time: 1.2240 data: 0.0002 max mem: 17666 +[05:00:01.241427] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.5518 (0.7368) closs: 0.9518 (0.9857) time: 1.2252 data: 0.0002 max mem: 17666 +[05:00:13.456707] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.5348 (0.7332) closs: 0.9462 (0.9844) time: 1.2232 data: 0.0002 max mem: 17666 +[05:00:25.704700] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.5677 (0.7322) closs: 0.9125 (0.9823) time: 1.2231 data: 0.0002 max mem: 17666 +[05:00:37.923520] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.5883 (0.7326) closs: 0.8610 (0.9811) time: 1.2233 data: 0.0002 max mem: 17666 +[05:00:50.181172] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.5724 (0.7299) closs: 0.9259 (0.9802) time: 1.2238 data: 0.0002 max mem: 17666 +[05:01:02.403659] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.5683 (0.7269) closs: 0.9560 (0.9795) time: 1.2239 data: 0.0002 max mem: 17666 +[05:01:14.663952] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.5575 (0.7241) closs: 0.9308 (0.9781) time: 1.2241 data: 0.0002 max mem: 17666 +[05:01:26.950895] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.5575 (0.7212) closs: 0.8756 (0.9775) time: 1.2273 data: 0.0003 max mem: 17666 +[05:01:39.165892] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.5441 (0.7187) closs: 0.9025 (0.9766) time: 1.2250 data: 0.0003 max mem: 17666 +[05:01:51.405903] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.5507 (0.7162) closs: 0.9051 (0.9764) time: 1.2227 data: 0.0002 max mem: 17666 +[05:02:03.610466] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.5490 (0.7134) closs: 0.9051 (0.9755) time: 1.2222 data: 0.0002 max mem: 17666 +[05:02:15.849292] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.5563 (0.7107) closs: 0.9230 (0.9747) time: 1.2221 data: 0.0002 max mem: 17666 +[05:02:28.073108] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.5737 (0.7087) closs: 0.8677 (0.9726) time: 1.2231 data: 0.0003 max mem: 17666 +[05:02:40.301764] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.5858 (0.7066) closs: 0.8455 (0.9718) time: 1.2226 data: 0.0003 max mem: 17666 +[05:02:52.540493] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.5850 (0.7049) closs: 0.8993 (0.9707) time: 1.2233 data: 0.0002 max mem: 17666 +[05:03:04.773003] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.5764 (0.7030) closs: 0.9169 (0.9703) time: 1.2235 data: 0.0002 max mem: 17666 +[05:03:17.077033] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.5656 (0.7012) closs: 0.9087 (0.9689) time: 1.2268 data: 0.0002 max mem: 17666 +[05:03:29.323502] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.5623 (0.6993) closs: 0.8749 (0.9684) time: 1.2274 data: 0.0002 max mem: 17666 +[05:03:41.591619] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.5259 (0.6967) closs: 0.8953 (0.9678) time: 1.2257 data: 0.0002 max mem: 17666 +[05:03:53.817646] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.5327 (0.6945) closs: 0.9142 (0.9673) time: 1.2246 data: 0.0002 max mem: 17666 +[05:04:06.075349] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.5405 (0.6927) closs: 0.9243 (0.9669) time: 1.2241 data: 0.0002 max mem: 17666 +[05:04:18.348610] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.5495 (0.6909) closs: 0.9243 (0.9666) time: 1.2265 data: 0.0002 max mem: 17666 +[05:04:30.558726] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.5495 (0.6890) closs: 0.8956 (0.9654) time: 1.2241 data: 0.0002 max mem: 17666 +[05:04:42.827196] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.5697 (0.6876) closs: 0.8870 (0.9649) time: 1.2239 data: 0.0002 max mem: 17666 +[05:04:55.071487] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.5697 (0.6858) closs: 0.8934 (0.9640) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:07.340719] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.5596 (0.6840) closs: 0.8871 (0.9627) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:19.598714] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.5451 (0.6823) closs: 0.9205 (0.9636) time: 1.2263 data: 0.0002 max mem: 17666 +[05:05:31.859047] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.5443 (0.6806) closs: 0.9429 (0.9627) time: 1.2258 data: 0.0002 max mem: 17666 +[05:05:44.113014] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.5571 (0.6793) closs: 0.9180 (0.9618) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:56.310853] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.5778 (0.6778) closs: 0.9180 (0.9614) time: 1.2225 data: 0.0001 max mem: 17666 +[05:06:08.553058] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.5687 (0.6765) closs: 0.8819 (0.9604) time: 1.2219 data: 0.0001 max mem: 17666 +[05:06:20.746249] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.5436 (0.6748) closs: 0.8819 (0.9599) time: 1.2217 data: 0.0001 max mem: 17666 +[05:06:33.006020] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.5421 (0.6735) closs: 0.8738 (0.9586) time: 1.2226 data: 0.0001 max mem: 17666 +[05:06:34.434061] Epoch: [0] Total time: 0:16:38 +[05:06:34.435967] Averaged stats: lr: 0.000050 grad_norm: 0.5358 (0.6733) closs: 0.8738 (0.9640) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[05:06:34.685369] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[05:06:36.004693] optimizer saved +[05:06:36.005298] other rank-common saved +[05:06:36.008307] rank-specific saved +[05:06:36.016492] log_dir: ./output_dir +[05:06:38.409445] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.5716 (0.5716) closs: 0.7068 (0.7068) time: 2.3920 data: 1.1581 max mem: 17666 +[05:06:50.657381] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.5716 (0.5743) closs: 0.8450 (0.8412) time: 1.3308 data: 0.1054 max mem: 17666 +[05:07:02.826034] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.5748 (0.5730) closs: 0.8714 (0.9120) time: 1.2208 data: 0.0001 max mem: 17666 +[05:07:14.934821] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.5622 (0.5677) closs: 0.9029 (0.9080) time: 1.2138 data: 0.0001 max mem: 17666 +[05:07:27.078058] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.5479 (0.5654) closs: 0.9393 (0.9264) time: 1.2125 data: 0.0001 max mem: 17666 +[05:07:39.214844] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.5380 (0.5637) closs: 0.9393 (0.9220) time: 1.2139 data: 0.0001 max mem: 17666 +[05:07:51.352136] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.5607 (0.5649) closs: 0.9001 (0.9198) time: 1.2136 data: 0.0001 max mem: 17666 +[05:08:03.517114] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.5700 (0.5683) closs: 0.9085 (0.9224) time: 1.2150 data: 0.0001 max mem: 17666 +[05:08:15.729009] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.5498 (0.5672) closs: 0.9337 (0.9186) time: 1.2188 data: 0.0001 max mem: 17666 +[05:08:27.942413] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.5372 (0.5618) closs: 0.9337 (0.9233) time: 1.2212 data: 0.0002 max mem: 17666 +[05:08:40.177210] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.5399 (0.5635) closs: 0.9525 (0.9257) time: 1.2223 data: 0.0002 max mem: 17666 +[05:08:52.395851] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.5470 (0.5630) closs: 0.9445 (0.9224) time: 1.2226 data: 0.0002 max mem: 17666 +[05:09:04.620910] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.5660 (0.5649) closs: 0.9445 (0.9235) time: 1.2221 data: 0.0002 max mem: 17666 +[05:09:16.823910] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.5666 (0.5642) closs: 0.9266 (0.9232) time: 1.2213 data: 0.0002 max mem: 17666 +[05:09:29.044814] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.5469 (0.5636) closs: 0.9191 (0.9222) time: 1.2211 data: 0.0002 max mem: 17666 +[05:09:41.250901] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.5427 (0.5631) closs: 0.9318 (0.9220) time: 1.2213 data: 0.0002 max mem: 17666 +[05:09:53.461830] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.5499 (0.5632) closs: 0.9413 (0.9241) time: 1.2208 data: 0.0002 max mem: 17666 +[05:10:05.704359] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.5499 (0.5628) 
closs: 0.9694 (0.9263) time: 1.2226 data: 0.0002 max mem: 17666 +[05:10:17.913859] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.5187 (0.5618) closs: 0.9341 (0.9267) time: 1.2225 data: 0.0002 max mem: 17666 +[05:10:30.152053] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.5289 (0.5600) closs: 0.8866 (0.9248) time: 1.2223 data: 0.0002 max mem: 17666 +[05:10:42.370346] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.5296 (0.5589) closs: 0.8866 (0.9232) time: 1.2228 data: 0.0002 max mem: 17666 +[05:10:54.621739] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.5406 (0.5576) closs: 0.9429 (0.9253) time: 1.2234 data: 0.0002 max mem: 17666 +[05:11:06.854883] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.5362 (0.5573) closs: 0.9438 (0.9241) time: 1.2242 data: 0.0002 max mem: 17666 +[05:11:19.064849] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.5260 (0.5562) closs: 0.8665 (0.9228) time: 1.2221 data: 0.0002 max mem: 17666 +[05:11:31.300169] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.5371 (0.5563) closs: 0.8964 (0.9237) time: 1.2222 data: 0.0001 max mem: 17666 +[05:11:43.496321] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.5503 (0.5565) closs: 0.9451 (0.9228) time: 1.2215 data: 0.0001 max mem: 17666 +[05:11:55.764297] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.5451 (0.5569) closs: 0.9342 (0.9233) time: 1.2231 data: 0.0002 max mem: 17666 +[05:12:07.971728] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.5451 (0.5568) closs: 0.9317 (0.9219) time: 1.2237 data: 0.0002 max mem: 17666 +[05:12:20.187157] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.5521 (0.5574) closs: 0.9182 (0.9220) time: 1.2211 data: 0.0002 max mem: 17666 +[05:12:32.426547] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.5328 (0.5572) closs: 0.9268 (0.9225) time: 1.2227 data: 0.0002 max mem: 17666 +[05:12:44.641825] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.5328 (0.5566) closs: 0.8957 (0.9212) time: 1.2227 data: 0.0002 max mem: 17666 +[05:12:56.872549] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.5379 (0.5574) closs: 0.8875 (0.9203) time: 1.2222 data: 0.0002 max mem: 17666 +[05:13:09.087618] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.5465 (0.5576) closs: 0.8875 (0.9207) time: 1.2222 data: 0.0002 max mem: 17666 +[05:13:21.321873] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.5207 (0.5569) closs: 0.8802 (0.9200) time: 1.2224 data: 0.0002 max mem: 17666 +[05:13:33.555776] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.5206 (0.5569) closs: 0.8556 (0.9179) time: 1.2233 data: 0.0002 max mem: 17666 +[05:13:45.822566] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.5441 (0.5565) closs: 0.8780 (0.9175) time: 1.2250 data: 0.0001 max mem: 17666 +[05:13:58.058935] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.5441 (0.5563) closs: 0.8524 (0.9153) time: 1.2251 data: 0.0001 max mem: 17666 +[05:14:10.306817] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.5542 (0.5571) closs: 0.8262 (0.9143) time: 1.2241 data: 0.0002 max mem: 17666 +[05:14:22.557391] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.5502 (0.5566) closs: 0.8911 (0.9145) time: 1.2249 data: 0.0001 max mem: 17666 +[05:14:34.775480] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.5379 (0.5564) closs: 0.9057 (0.9150) time: 1.2234 data: 0.0001 max mem: 17666 +[05:14:47.019726] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.5379 (0.5560) closs: 0.9057 (0.9147) time: 1.2231 data: 0.0001 max mem: 17666 +[05:14:59.254412] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.5308 (0.5556) closs: 0.9153 (0.9160) time: 1.2239 data: 0.0001 max mem: 17666 +[05:15:11.483844] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.5308 (0.5551) closs: 0.9592 (0.9165) time: 1.2231 data: 0.0001 max mem: 17666 +[05:15:23.696028] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.5458 (0.5561) closs: 0.8909 (0.9157) time: 1.2220 data: 0.0002 max mem: 17666 +[05:15:35.910601] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.5484 (0.5561) closs: 0.9059 (0.9157) time: 1.2213 data: 0.0002 max mem: 17666 +[05:15:48.141787] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.5484 (0.5563) closs: 0.9138 (0.9153) time: 1.2222 data: 0.0002 max mem: 17666 +[05:16:00.351582] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.5362 (0.5557) closs: 0.8589 (0.9147) time: 1.2220 data: 0.0002 max mem: 17666 +[05:16:12.585800] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.5292 (0.5555) closs: 0.8808 (0.9147) time: 1.2221 data: 0.0002 max mem: 17666 +[05:16:24.813078] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.5494 (0.5564) closs: 0.8981 (0.9153) time: 1.2230 data: 0.0002 max mem: 17666 +[05:16:37.044404] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.5662 (0.5563) closs: 0.9437 (0.9160) time: 1.2229 data: 0.0002 max mem: 17666 +[05:16:49.306306] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.5418 (0.5562) closs: 0.9002 (0.9156) time: 1.2246 data: 0.0002 max mem: 17666 +[05:17:01.504233] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.5569 (0.5596) closs: 0.8913 (0.9152) time: 1.2229 data: 0.0001 max mem: 17666 +[05:17:13.732894] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.5569 (0.5594) closs: 0.9053 (0.9152) time: 1.2213 data: 0.0002 max mem: 17666 +[05:17:25.950468] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.5472 (0.5591) closs: 0.8929 (0.9153) time: 1.2222 data: 0.0001 max mem: 17666 +[05:17:38.187123] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.5325 (0.5590) closs: 0.8836 (0.9154) time: 1.2226 data: 0.0001 max mem: 17666 +[05:17:50.416363] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.5393 (0.5585) closs: 0.8734 (0.9148) time: 1.2232 data: 0.0002 max mem: 17666 +[05:18:02.631787] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.5416 (0.5586) closs: 0.9068 (0.9150) time: 1.2222 data: 0.0002 max mem: 17666 +[05:18:14.860124] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.5634 (0.5589) closs: 0.9202 (0.9154) time: 1.2221 data: 0.0002 max mem: 17666 +[05:18:27.083101] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.5520 (0.5588) closs: 0.9180 (0.9145) time: 1.2225 data: 0.0002 max mem: 17666 +[05:18:39.321277] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.5561 (0.5592) closs: 0.9180 (0.9147) time: 1.2230 data: 0.0002 max mem: 17666 +[05:18:51.538167] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.5392 (0.5588) closs: 0.9240 (0.9150) time: 1.2227 data: 0.0002 max mem: 17666 +[05:19:03.786880] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.5296 (0.5585) closs: 0.8944 (0.9151) time: 1.2232 data: 0.0001 max mem: 17666 +[05:19:16.029554] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.5438 (0.5580) closs: 0.9104 (0.9152) time: 1.2245 data: 0.0002 max mem: 17666 +[05:19:28.287030] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.5312 (0.5574) closs: 0.9339 (0.9155) time: 1.2249 data: 0.0002 max mem: 17666 +[05:19:40.534557] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.5312 (0.5572) closs: 0.9009 (0.9155) time: 1.2252 data: 0.0002 max mem: 17666 +[05:19:52.747120] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.5282 (0.5566) closs: 0.8875 (0.9153) time: 1.2229 data: 0.0001 max mem: 17666 +[05:20:04.986132] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.5202 (0.5564) closs: 0.9420 (0.9163) time: 1.2225 data: 0.0001 
max mem: 17666 +[05:20:17.207197] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.5224 (0.5563) closs: 0.9829 (0.9170) time: 1.2229 data: 0.0002 max mem: 17666 +[05:20:29.408267] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.5301 (0.5566) closs: 0.9759 (0.9177) time: 1.2210 data: 0.0002 max mem: 17666 +[05:20:41.615333] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.5389 (0.5570) closs: 0.9759 (0.9181) time: 1.2203 data: 0.0002 max mem: 17666 +[05:20:53.843553] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.5477 (0.5568) closs: 0.9354 (0.9181) time: 1.2217 data: 0.0002 max mem: 17666 +[05:21:06.076892] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.5477 (0.5567) closs: 0.8810 (0.9178) time: 1.2230 data: 0.0002 max mem: 17666 +[05:21:18.276846] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.5319 (0.5565) closs: 0.8641 (0.9173) time: 1.2216 data: 0.0001 max mem: 17666 +[05:21:30.499197] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.5319 (0.5562) closs: 0.9143 (0.9173) time: 1.2211 data: 0.0001 max mem: 17666 +[05:21:42.697532] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.5363 (0.5563) closs: 0.9228 (0.9173) time: 1.2210 data: 0.0001 max mem: 17666 +[05:21:54.912903] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.5423 (0.5559) closs: 0.8785 (0.9170) time: 1.2206 data: 0.0001 max mem: 17666 +[05:22:07.135892] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.5280 (0.5560) closs: 0.8854 (0.9174) time: 1.2219 data: 0.0002 max mem: 17666 +[05:22:19.333692] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.5221 (0.5555) closs: 0.9338 (0.9169) time: 1.2210 data: 0.0001 max mem: 17666 +[05:22:31.551464] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.5360 (0.5559) closs: 0.8506 (0.9166) time: 1.2207 data: 0.0001 max mem: 17666 +[05:22:43.738476] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.5617 (0.5560) closs: 0.8934 (0.9169) time: 1.2202 data: 0.0001 max mem: 17666 +[05:22:55.936330] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.5488 (0.5560) closs: 0.9056 (0.9165) time: 1.2192 data: 0.0001 max mem: 17666 +[05:23:08.115449] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.5498 (0.5563) closs: 0.9065 (0.9165) time: 1.2188 data: 0.0001 max mem: 17666 +[05:23:09.610496] Epoch: [1] Total time: 0:16:33 +[05:23:09.633077] Averaged stats: lr: 0.000039 grad_norm: 0.5498 (0.5562) closs: 0.9187 (0.9137) +[05:23:09.936052] model saved +[05:23:11.238391] optimizer saved +[05:23:11.238903] other rank-common saved +[05:23:11.241868] rank-specific saved +[05:23:11.249988] log_dir: ./output_dir +[05:23:13.663887] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.7125 (0.7125) closs: 0.8745 (0.8745) time: 2.4131 data: 1.1672 max mem: 17666 +[05:23:25.915941] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.5159 (0.5531) closs: 0.8802 (0.8976) time: 1.3331 data: 0.1062 max mem: 17666 +[05:23:38.174870] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.5206 (0.5430) closs: 0.8802 (0.8902) time: 1.2255 data: 0.0002 max mem: 17666 +[05:23:50.412310] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.5260 (0.5434) closs: 0.8835 (0.8971) time: 1.2247 data: 0.0002 max mem: 17666 +[05:24:02.710838] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.5309 (0.5425) closs: 0.8968 (0.8971) time: 1.2267 data: 0.0002 max mem: 17666 +[05:24:15.054535] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.5321 (0.5424) closs: 0.9001 (0.8951) time: 1.2320 data: 0.0004 max mem: 17666 +[05:24:27.318243] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.5432 (0.5419) closs: 0.9220 (0.8985) time: 1.2303 data: 0.0004 max mem: 17666 +[05:24:39.554150] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.5432 (0.5457) closs: 0.9125 (0.9035) time: 1.2249 data: 0.0002 max mem: 17666 +[05:24:51.781126] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.5337 (0.5448) closs: 0.8991 (0.9013) time: 1.2231 data: 0.0002 max mem: 17666 +[05:25:03.998186] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.5328 (0.5431) closs: 0.8633 (0.8985) time: 1.2221 data: 0.0002 max mem: 17666 +[05:25:16.195929] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.5328 (0.5437) closs: 0.8633 (0.8997) time: 1.2207 data: 0.0002 max mem: 17666 +[05:25:28.433549] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.5378 (0.5440) closs: 0.8652 (0.8944) time: 1.2217 data: 0.0002 max mem: 17666 +[05:25:40.630768] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.5295 (0.5427) closs: 0.8916 (0.8959) time: 1.2217 data: 0.0002 max mem: 17666 +[05:25:52.846608] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.5252 (0.5422) closs: 0.8785 (0.8946) time: 1.2206 data: 0.0002 max mem: 17666 +[05:26:05.118006] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.5381 (0.5433) closs: 0.8633 (0.8939) time: 1.2243 data: 0.0002 max mem: 17666 +[05:26:17.382923] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.5409 (0.5437) closs: 0.8702 (0.8949) time: 1.2267 data: 0.0002 max mem: 17666 +[05:26:29.624515] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.5378 (0.5439) closs: 0.9324 (0.8958) time: 1.2253 data: 0.0002 max mem: 17666 +[05:26:41.866362] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.5373 (0.5433) closs: 0.8786 (0.8948) time: 1.2241 data: 0.0002 max mem: 17666 +[05:26:54.115542] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.5291 (0.5426) closs: 0.8532 (0.8941) time: 1.2245 data: 0.0002 max mem: 17666 +[05:27:06.354062] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.5375 (0.5448) closs: 0.8871 (0.8957) time: 1.2243 data: 0.0002 max mem: 17666 +[05:27:18.583612] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.5450 (0.5459) closs: 0.9092 (0.8974) time: 1.2233 data: 0.0002 max mem: 17666 +[05:27:30.827297] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.5450 (0.5465) closs: 0.8947 (0.8979) time: 1.2236 data: 0.0002 max mem: 17666 +[05:27:43.099277] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.5412 (0.5470) closs: 0.9164 (0.8990) time: 1.2257 data: 0.0002 max mem: 17666 +[05:27:55.311466] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.5342 (0.5467) closs: 0.9164 (0.8996) time: 1.2241 data: 0.0002 max mem: 17666 +[05:28:07.527387] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.5269 (0.5467) closs: 0.8845 (0.8995) time: 1.2213 data: 0.0002 max mem: 17666 +[05:28:19.757509] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.5346 (0.5467) closs: 0.8747 (0.9001) time: 1.2222 data: 0.0002 max mem: 17666 +[05:28:31.953868] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.5462 (0.5475) closs: 0.8479 (0.8997) time: 1.2213 data: 0.0002 max mem: 17666 +[05:28:44.170627] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.5350 (0.5466) closs: 0.8574 (0.9012) time: 1.2206 data: 0.0002 max mem: 17666 +[05:28:56.370480] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.5331 (0.5468) closs: 0.8961 (0.9028) time: 1.2208 data: 0.0002 max mem: 17666 +[05:29:08.686787] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.5416 (0.5475) closs: 0.8706 (0.8995) time: 1.2257 data: 0.0002 max mem: 17666 +[05:29:20.929953] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.5308 (0.5467) closs: 0.8766 (0.8992) time: 1.2279 data: 0.0002 max mem: 17666 +[05:29:33.133327] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.5419 (0.5486) closs: 0.9146 (0.8993) time: 1.2223 data: 0.0002 max mem: 17666 
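
The per-iteration lines above all share one fixed layout: a timestamp, `Epoch: [e] [i/n]`, then `key: value (value)` pairs, where the first number is a recent reading and the parenthesized one a running average (the usual MetricLogger convention). A minimal sketch for pulling the `closs` curve out of such a log follows; the regex, the assumption that the layout never varies, and the file name `output.log` are illustrative assumptions, not part of the training code itself.

```python
import re

# Matches per-iteration lines shaped like the log above; the
# "closs: inst (avg)" layout is assumed to be stable across lines.
LINE = re.compile(
    r"Epoch: \[(?P<epoch>\d+)\] \[(?P<step>\d+)/(?P<total>\d+)\].*?"
    r"closs: (?P<inst>[0-9.]+) \((?P<avg>[0-9.]+)\)"
)

def iter_closs(path="output.log"):
    """Yield (epoch, step, instantaneous closs, running-average closs)."""
    with open(path) as fh:
        for line in fh:
            m = LINE.search(line)
            if m:
                yield (int(m["epoch"]), int(m["step"]),
                       float(m["inst"]), float(m["avg"]))

# Usage: final running average per epoch,
# e.g. {1: 0.9165, 2: 0.9010, 3: 0.8947} for the run logged above.
final = {}
for epoch, step, inst, avg in iter_closs():
    final[epoch] = avg
print(final)
```
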
+[05:29:45.380974] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.5416 (0.5480) closs: 0.9215 (0.9009) time: 1.2225 data: 0.0002 max mem: 17666 +[05:29:57.586882] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.5708 (0.5498) closs: 0.9306 (0.9025) time: 1.2226 data: 0.0002 max mem: 17666 +[05:30:09.877899] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.5762 (0.5500) closs: 0.9558 (0.9035) time: 1.2248 data: 0.0002 max mem: 17666 +[05:30:22.088804] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.5395 (0.5498) closs: 0.9168 (0.9032) time: 1.2250 data: 0.0002 max mem: 17666 +[05:30:34.326928] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8943 (0.9040) time: 1.2224 data: 0.0002 max mem: 17666 +[05:30:46.554197] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8968 (0.9041) time: 1.2232 data: 0.0002 max mem: 17666 +[05:30:58.787894] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.5605 (0.5500) closs: 0.8968 (0.9033) time: 1.2230 data: 0.0002 max mem: 17666 +[05:31:11.026268] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.5654 (0.5508) closs: 0.8928 (0.9019) time: 1.2235 data: 0.0002 max mem: 17666 +[05:31:23.250055] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.5627 (0.5509) closs: 0.8756 (0.9017) time: 1.2230 data: 0.0002 max mem: 17666 +[05:31:35.499135] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.5489 (0.5520) closs: 0.8756 (0.9014) time: 1.2236 data: 0.0002 max mem: 17666 +[05:31:47.722078] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.5539 (0.5521) closs: 0.8572 (0.9015) time: 1.2235 data: 0.0002 max mem: 17666 +[05:32:00.005353] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.5353 (0.5520) closs: 0.8631 (0.9021) time: 1.2252 data: 0.0002 max mem: 17666 +[05:32:12.261554] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.5384 (0.5529) closs: 0.9113 (0.9020) time: 1.2269 data: 0.0002 max mem: 17666 +[05:32:24.476152] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.5707 (0.5529) closs: 0.9142 (0.9032) time: 1.2235 data: 0.0002 max mem: 17666 +[05:32:36.730553] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.5230 (0.5534) closs: 0.9659 (0.9053) time: 1.2234 data: 0.0002 max mem: 17666 +[05:32:48.963624] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.5223 (0.5531) closs: 0.9463 (0.9048) time: 1.2243 data: 0.0002 max mem: 17666 +[05:33:01.221874] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.5196 (0.5530) closs: 0.9217 (0.9058) time: 1.2245 data: 0.0002 max mem: 17666 +[05:33:13.454427] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.5165 (0.5525) closs: 0.9051 (0.9051) time: 1.2245 data: 0.0002 max mem: 17666 +[05:33:25.723568] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.5228 (0.5525) closs: 0.8909 (0.9047) time: 1.2250 data: 0.0002 max mem: 17666 +[05:33:38.061671] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.5380 (0.5523) closs: 0.8909 (0.9047) time: 1.2303 data: 0.0002 max mem: 17666 +[05:33:50.262136] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.5317 (0.5519) closs: 0.8782 (0.9047) time: 1.2268 data: 0.0002 max mem: 17666 +[05:34:02.535828] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.5358 (0.5519) closs: 0.8886 (0.9047) time: 1.2236 data: 0.0002 max mem: 17666 +[05:34:14.922873] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.5384 (0.5516) closs: 0.8886 (0.9045) time: 1.2330 data: 0.0002 max mem: 17666 +[05:34:27.171951] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.5198 (0.5508) closs: 0.8887 (0.9045) time: 1.2317 data: 0.0002 max mem: 17666 +[05:34:39.396660] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.5260 (0.5509) closs: 0.8877 
(0.9046) time: 1.2236 data: 0.0002 max mem: 17666 +[05:34:51.640988] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.5627 (0.5513) closs: 0.8820 (0.9039) time: 1.2234 data: 0.0001 max mem: 17666 +[05:35:03.912231] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.5364 (0.5511) closs: 0.8524 (0.9029) time: 1.2257 data: 0.0002 max mem: 17666 +[05:35:16.149707] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.5502 (0.5518) closs: 0.8858 (0.9019) time: 1.2254 data: 0.0002 max mem: 17666 +[05:35:28.402157] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.5616 (0.5521) closs: 0.8858 (0.9020) time: 1.2244 data: 0.0001 max mem: 17666 +[05:35:40.600967] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.5462 (0.5522) closs: 0.8935 (0.9018) time: 1.2225 data: 0.0001 max mem: 17666 +[05:35:52.829356] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.5462 (0.5521) closs: 0.8935 (0.9016) time: 1.2213 data: 0.0002 max mem: 17666 +[05:36:05.050971] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.5440 (0.5521) closs: 0.8583 (0.9015) time: 1.2224 data: 0.0002 max mem: 17666 +[05:36:17.274985] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.5375 (0.5518) closs: 0.8728 (0.9015) time: 1.2222 data: 0.0002 max mem: 17666 +[05:36:29.516934] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.5418 (0.5523) closs: 0.8445 (0.8999) time: 1.2232 data: 0.0002 max mem: 17666 +[05:36:41.764958] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.5287 (0.5519) closs: 0.8310 (0.8993) time: 1.2244 data: 0.0002 max mem: 17666 +[05:36:54.012876] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.5246 (0.5520) closs: 0.8673 (0.8988) time: 1.2247 data: 0.0002 max mem: 17666 +[05:37:06.248293] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.5323 (0.5519) closs: 0.9063 (0.8993) time: 1.2241 data: 0.0002 max mem: 17666 +[05:37:18.499422] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.5320 (0.5520) closs: 0.9313 (0.8995) time: 1.2243 data: 0.0002 max mem: 17666 +[05:37:30.723896] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.5400 (0.5520) closs: 0.8904 (0.8992) time: 1.2237 data: 0.0002 max mem: 17666 +[05:37:42.959287] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.5535 (0.5522) closs: 0.8538 (0.8986) time: 1.2229 data: 0.0002 max mem: 17666 +[05:37:55.217857] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.5448 (0.5520) closs: 0.8520 (0.8987) time: 1.2246 data: 0.0002 max mem: 17666 +[05:38:07.427583] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.5490 (0.5522) closs: 0.8863 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 +[05:38:19.686100] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.5623 (0.5524) closs: 0.8536 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 +[05:38:31.903113] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.5355 (0.5521) closs: 0.8654 (0.8993) time: 1.2237 data: 0.0002 max mem: 17666 +[05:38:44.115597] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.5332 (0.5522) closs: 0.8976 (0.8987) time: 1.2214 data: 0.0002 max mem: 17666 +[05:38:56.321914] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.5436 (0.5522) closs: 0.9065 (0.8994) time: 1.2209 data: 0.0002 max mem: 17666 +[05:39:08.539215] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.5429 (0.5522) closs: 0.9551 (0.8999) time: 1.2211 data: 0.0002 max mem: 17666 +[05:39:20.776202] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.5429 (0.5520) closs: 0.9152 (0.9003) time: 1.2226 data: 0.0001 max mem: 17666 +[05:39:33.004979] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.5364 (0.5520) closs: 0.8958 (0.9006) time: 1.2232 data: 0.0001 max mem: 17666 +[05:39:45.263298] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.5421 (0.5519) closs: 0.9132 (0.9010) time: 1.2243 data: 0.0001 max mem: 17666 +[05:39:46.761821] Epoch: [2] Total time: 0:16:35 +[05:39:46.769483] Averaged stats: lr: 0.000016 grad_norm: 0.5364 (0.5519) closs: 0.9132 (0.9010) +[05:39:47.092441] model saved +[05:39:48.487045] optimizer saved +[05:39:48.487749] other rank-common saved +[05:39:48.491225] rank-specific saved +[05:39:48.499785] log_dir: ./output_dir +[05:39:50.944123] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.5562 (0.5562) closs: 0.6677 (0.6677) time: 2.4434 data: 1.2020 max mem: 17666 +[05:40:03.157106] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.5337 (0.5507) closs: 0.9688 (0.9449) time: 1.3323 data: 0.1094 max mem: 17666 +[05:40:15.422338] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.5337 (0.5547) closs: 0.9130 (0.9159) time: 1.2238 data: 0.0002 max mem: 17666 +[05:40:27.617120] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.5153 (0.5406) closs: 0.9044 (0.9138) time: 1.2229 data: 0.0002 max mem: 17666 +[05:40:39.859947] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.5153 (0.5433) closs: 0.8974 (0.9028) time: 1.2218 data: 0.0002 max mem: 17666 +[05:40:52.133298] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.5365 (0.5437) closs: 0.9004 (0.9038) time: 1.2257 data: 0.0002 max mem: 17666 +[05:41:04.316686] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.5300 (0.5426) closs: 0.8967 (0.9080) time: 1.2228 data: 0.0002 max mem: 17666 +[05:41:16.709159] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.5389 (0.5437) closs: 0.8772 (0.9053) time: 1.2287 data: 0.0002 max mem: 17666 +[05:41:28.981080] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.5358 (0.5439) closs: 0.8995 (0.9040) time: 1.2331 data: 0.0002 max mem: 17666 +[05:41:41.218043] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.5358 (0.5482) closs: 0.9016 (0.9026) time: 1.2254 data: 0.0002 max mem: 17666 +[05:41:53.459622] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.5407 (0.5488) closs: 0.8973 (0.9010) time: 1.2238 data: 0.0002 max mem: 17666 +[05:42:05.684572] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.5407 (0.5518) closs: 0.8370 (0.8944) time: 1.2233 data: 0.0002 max mem: 17666 +[05:42:17.955761] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.5543 (0.5515) closs: 0.8370 (0.8983) time: 1.2247 data: 0.0002 max mem: 17666 +[05:42:30.181846] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.5379 (0.5507) closs: 0.8520 (0.8947) time: 1.2248 data: 0.0002 max mem: 17666 +[05:42:42.456814] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.5309 (0.5496) closs: 0.8633 (0.8987) time: 1.2250 data: 0.0002 max mem: 17666 +[05:42:54.678890] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.5271 (0.5505) closs: 0.9299 (0.8988) time: 1.2248 data: 0.0002 max mem: 17666 +[05:43:06.935982] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.5488 (0.5496) closs: 0.8740 (0.8989) time: 1.2239 data: 0.0002 max mem: 17666 +[05:43:19.181914] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.5464 (0.5495) closs: 0.8753 (0.8977) time: 1.2251 data: 0.0002 max mem: 17666 +[05:43:31.400721] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.5487 (0.5500) closs: 0.8691 (0.8962) time: 1.2232 data: 0.0002 max mem: 17666 +[05:43:43.639535] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.5439 (0.5500) closs: 0.8474 (0.8957) time: 1.2228 data: 0.0002 max mem: 17666 +[05:43:55.888343] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.5147 (0.5471) closs: 0.9034 (0.8973) time: 1.2243 data: 0.0002 max mem: 17666 +[05:44:08.136204] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.5083 (0.5460) closs: 0.8917 (0.8965) 
time: 1.2248 data: 0.0002 max mem: 17666 +[05:44:20.360724] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.5294 (0.5456) closs: 0.8673 (0.8980) time: 1.2235 data: 0.0002 max mem: 17666 +[05:44:32.597597] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.5309 (0.5452) closs: 0.8934 (0.8987) time: 1.2230 data: 0.0002 max mem: 17666 +[05:44:44.822067] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.5445 (0.5462) closs: 0.8918 (0.8988) time: 1.2230 data: 0.0002 max mem: 17666 +[05:44:57.071616] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.5577 (0.5471) closs: 0.8841 (0.8983) time: 1.2236 data: 0.0002 max mem: 17666 +[05:45:09.334382] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.5525 (0.5469) closs: 0.8411 (0.8965) time: 1.2255 data: 0.0002 max mem: 17666 +[05:45:21.576233] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.5297 (0.5460) closs: 0.8316 (0.8970) time: 1.2252 data: 0.0002 max mem: 17666 +[05:45:33.792847] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.5256 (0.5458) closs: 0.8528 (0.8986) time: 1.2228 data: 0.0002 max mem: 17666 +[05:45:45.985161] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.5459 (0.5469) closs: 0.8481 (0.8978) time: 1.2204 data: 0.0002 max mem: 17666 +[05:45:58.246023] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.5559 (0.5478) closs: 0.8454 (0.8950) time: 1.2226 data: 0.0002 max mem: 17666 +[05:46:10.538520] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.5453 (0.5486) closs: 0.8714 (0.8966) time: 1.2276 data: 0.0002 max mem: 17666 +[05:46:22.758193] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.5333 (0.5487) closs: 0.9090 (0.8970) time: 1.2255 data: 0.0002 max mem: 17666 +[05:46:35.009523] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.5240 (0.5482) closs: 0.9090 (0.8967) time: 1.2235 data: 0.0002 max mem: 17666 +[05:46:47.248170] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.5495 (0.5481) closs: 0.9205 (0.8972) time: 1.2244 data: 0.0002 max mem: 17666 +[05:46:59.513079] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.5428 (0.5478) closs: 0.9117 (0.8979) time: 1.2251 data: 0.0002 max mem: 17666 +[05:47:11.756475] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.5528 (0.5483) closs: 0.9040 (0.8985) time: 1.2253 data: 0.0002 max mem: 17666 +[05:47:24.001759] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.5583 (0.5497) closs: 0.9428 (0.8988) time: 1.2244 data: 0.0002 max mem: 17666 +[05:47:36.258369] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.5532 (0.5498) closs: 0.8826 (0.8977) time: 1.2250 data: 0.0002 max mem: 17666 +[05:47:48.534397] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.5464 (0.5498) closs: 0.8169 (0.8967) time: 1.2266 data: 0.0002 max mem: 17666 +[05:48:00.757119] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.5397 (0.5499) closs: 0.8881 (0.8972) time: 1.2249 data: 0.0002 max mem: 17666 +[05:48:12.965384] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.5424 (0.5500) closs: 0.8712 (0.8959) time: 1.2215 data: 0.0002 max mem: 17666 +[05:48:25.205411] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.5482 (0.5500) closs: 0.8673 (0.8952) time: 1.2223 data: 0.0002 max mem: 17666 +[05:48:37.398291] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.5379 (0.5502) closs: 0.8604 (0.8939) time: 1.2216 data: 0.0002 max mem: 17666 +[05:48:49.634277] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.5367 (0.5501) closs: 0.8604 (0.8932) time: 1.2214 data: 0.0002 max mem: 17666 +[05:49:01.842468] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.5338 (0.5509) closs: 0.8707 (0.8928) time: 1.2221 data: 0.0002 max mem: 17666 +[05:49:14.048276] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.5654 (0.5517) closs: 0.8607 (0.8921) time: 1.2206 data: 0.0002 max mem: 17666 +[05:49:26.270248] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.5506 (0.5519) closs: 0.8607 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 +[05:49:38.467867] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.5433 (0.5522) closs: 0.8556 (0.8913) time: 1.2209 data: 0.0002 max mem: 17666 +[05:49:50.683011] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.5388 (0.5518) closs: 0.8579 (0.8916) time: 1.2206 data: 0.0002 max mem: 17666 +[05:50:02.890194] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.5285 (0.5519) closs: 0.8663 (0.8911) time: 1.2210 data: 0.0002 max mem: 17666 +[05:50:15.194581] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.5267 (0.5519) closs: 0.8912 (0.8925) time: 1.2255 data: 0.0002 max mem: 17666 +[05:50:27.434194] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.5485 (0.5521) closs: 0.9753 (0.8933) time: 1.2271 data: 0.0002 max mem: 17666 +[05:50:39.651081] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.5609 (0.5526) closs: 0.8542 (0.8926) time: 1.2228 data: 0.0002 max mem: 17666 +[05:50:51.885523] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.5575 (0.5528) closs: 0.8232 (0.8925) time: 1.2225 data: 0.0002 max mem: 17666 +[05:51:04.114756] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.5378 (0.5530) closs: 0.8856 (0.8916) time: 1.2231 data: 0.0002 max mem: 17666 +[05:51:16.348968] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.5246 (0.5528) closs: 0.8477 (0.8911) time: 1.2231 data: 0.0002 max mem: 17666 +[05:51:28.542827] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.5556 (0.5531) closs: 0.8762 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 +[05:51:40.755871] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.5559 (0.5532) closs: 0.9160 (0.8922) time: 1.2203 data: 0.0002 max mem: 17666 +[05:51:52.996942] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.5523 (0.5531) closs: 0.8803 (0.8920) time: 1.2226 data: 0.0002 max mem: 17666 +[05:52:05.218398] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.5592 (0.5532) closs: 0.8175 (0.8915) time: 1.2231 data: 0.0002 max mem: 17666 +[05:52:17.508068] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.5655 (0.5533) closs: 0.9026 (0.8923) time: 1.2255 data: 0.0002 max mem: 17666 +[05:52:29.722507] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.5540 (0.5537) closs: 0.8860 (0.8926) time: 1.2251 data: 0.0002 max mem: 17666 +[05:52:41.951732] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.5512 (0.5543) closs: 0.8709 (0.8918) time: 1.2221 data: 0.0002 max mem: 17666 +[05:52:54.184550] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.5541 (0.5544) closs: 0.8467 (0.8915) time: 1.2230 data: 0.0002 max mem: 17666 +[05:53:06.428821] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.5534 (0.5547) closs: 0.8995 (0.8926) time: 1.2238 data: 0.0002 max mem: 17666 +[05:53:18.690090] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.5558 (0.5549) closs: 0.9336 (0.8932) time: 1.2252 data: 0.0002 max mem: 17666 +[05:53:30.911141] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.5558 (0.5548) closs: 0.8835 (0.8925) time: 1.2240 data: 0.0002 max mem: 17666 +[05:53:43.161458] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.5321 (0.5547) closs: 0.8819 (0.8931) time: 1.2235 data: 0.0002 max mem: 17666 +[05:53:55.371169] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.5240 (0.5545) closs: 0.8819 (0.8929) time: 1.2229 data: 0.0002 max mem: 17666 +[05:54:07.586578] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.5243 (0.5545) closs: 0.8890 (0.8933) time: 1.2212 data: 0.0002 max mem: 17666 
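
Each `optimizer saved` line in this log is preceded by the `UserWarning` shown near the top: `FullyShardedDataParallel.full_optim_state_dict` is deprecated in favor of `FullyShardedDataParallel.optim_state_dict` and may be removed after PyTorch 2.2. A minimal sketch of the migration the warning points to is below; `model`, `optimizer`, and `rank` are stand-ins for the training script's own objects, and which ranks materialize the consolidated dict depends on the configured `StateDictType` (with the default `FULL_STATE_DICT`, typically every rank holds it).

```python
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def save_optim_state(model, optimizer, path, rank):
    # Deprecated call that triggers the UserWarning seen in this log:
    #   osd = FSDP.full_optim_state_dict(model, optimizer)
    # Replacement suggested by the warning (PyTorch >= 2.0):
    osd = FSDP.optim_state_dict(model, optimizer)
    if rank == 0:  # save once; other ranks discard their copy
        torch.save(osd, path)
```
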
+[05:54:19.795256] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.5534 (0.5550) closs: 0.9166 (0.8937) time: 1.2211 data: 0.0002 max mem: 17666 +[05:54:32.015414] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.5337 (0.5548) closs: 0.9047 (0.8940) time: 1.2214 data: 0.0002 max mem: 17666 +[05:54:44.241974] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.5269 (0.5549) closs: 0.8744 (0.8932) time: 1.2223 data: 0.0002 max mem: 17666 +[05:54:56.452329] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.5342 (0.5546) closs: 0.8736 (0.8936) time: 1.2218 data: 0.0002 max mem: 17666 +[05:55:08.685255] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.5347 (0.5546) closs: 0.8736 (0.8932) time: 1.2221 data: 0.0002 max mem: 17666 +[05:55:20.895362] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.5396 (0.5548) closs: 0.8711 (0.8937) time: 1.2221 data: 0.0002 max mem: 17666 +[05:55:33.133468] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.5376 (0.5548) closs: 0.8961 (0.8938) time: 1.2223 data: 0.0002 max mem: 17666 +[05:55:45.329927] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.5376 (0.5551) closs: 0.8806 (0.8935) time: 1.2217 data: 0.0002 max mem: 17666 +[05:55:57.537681] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.5535 (0.5551) closs: 0.8785 (0.8936) time: 1.2201 data: 0.0001 max mem: 17666 +[05:56:09.778726] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.5479 (0.5551) closs: 0.9185 (0.8940) time: 1.2224 data: 0.0001 max mem: 17666 +[05:56:21.990260] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.5479 (0.5552) closs: 0.9699 (0.8947) time: 1.2226 data: 0.0001 max mem: 17666 +[05:56:23.509702] Epoch: [3] Total time: 0:16:35 +[05:56:23.512943] Averaged stats: lr: 0.000005 grad_norm: 0.5555 (0.5554) closs: 0.9666 (0.8950) +[05:56:23.780574] model saved +[05:56:25.200026] optimizer saved +[05:56:25.200659] other rank-common saved +[05:56:25.203616] rank-specific saved +[05:56:25.203811] Training time 1:06:29 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..42b3545022ed0c9ba4047a2d43746d8e0678966d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f79632e6a7ff0d46489b43cff701139e8645f43bec7e32cb5d687cb503cec0 +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3a347fbf41eea2d382e98d812fb29debb9954e4 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2494a26669b4d50d54988024ed1ee0056139168b77a4b96f33edf4537324bff7 +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..6580ed46844c8196025c2b4408c0a94b11a3286d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:97d31e98bf3b50a9d9871169f77286c5546169adca7c3a2a188449c713f14f8c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8c3a35cca5a559e355024cb252d318095ae46c9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa15182e7ab119a25276d512b31db08d8b47ea700b4ac7f2e3e2820f453621c2 +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ff6e848e440516d8813521768ce52cc4c0e7ab9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c755bf0c774ab128c975ad3111d38c03d5fc7a8083be54a8bd6130936e9c189d +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..019815c54f38ca0117b2de142a833f36cf716f12 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd92ff1afa34a36c0974183d0b5c5014a24b768cafce4c0456053efdb3be65f +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..092ee2ad6f9d75acb8f5d6c4189c9c2312533f4d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a7de8305a6b1286201270e9f6452693b86ab8a28eac06237986264f15f491c +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd87dc791566e0b6234c1981ceb12bcb1b9dd161 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d439fa2c56094ad77e821131dda743becb65d0c43f025ccea1a7b1e031730cd +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..f69f0d31ce8e9530fb0924334a376441d7622460 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3767b207334656ecfef01d2de48627986334fc7f316b6a0764dbd5d18e40093c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e07dffae975107d6528e85cde208d0a2a3795dbd --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f024463d38fab32865ace8ed97a55e10a820080f9a8d9377e5c94615ea9e8f +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth new file 
mode 100644 index 0000000000000000000000000000000000000000..2cb084d7f45451eee4c753268d244361a7a8dcda --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e148b43413f2c3e62b92ae2c9995effef04ecc773c694b65ba52897c00b9497b +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb934285a6ff5bf3f441fd2d3de536082f91905d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9002aa74e4a1bfbe09add178be306f8d719a30c9a5ec892a01f20ffc4116ff8c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..d21cd68fdc1fa6ba68b8123cb153f0df70ff1937 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.42762760063770955, "train_closs": 0.9074021448872139, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.42762760063770955, "val_closs": 0.9074021448872139} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.3180465380583197, "train_closs": 0.8586826063417302, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.3180465380583197, "val_closs": 0.8586826063417302} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.3192772727845044, "train_closs": 0.8497503121323847, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.3192772727845044, "val_closs": 0.8497503121323847} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.3224347219274843, "train_closs": 0.846084712903598, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.3224347219274843, "val_closs": 0.846084712903598} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..233da8dd518b474697fae9e4b91af66a3b7cc39f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log @@ -0,0 +1,591 @@ +WARNING:torch.distributed.run: 
+***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +[02:21:24.645037] > initializing model parallel with size 1 +[02:21:24.645129] > initializing ddp with size 8 +[02:21:24.645136] > initializing pipeline with size 1 +[02:21:24.802227] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[02:21:24.802312] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-13b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[02:21:24.803162] Start initialization. +[02:21:24.803195] ## Processing on RANK 0. +[02:21:24.813174] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[02:22:55.142663] Model is Peft: True +[02:22:55.146562] Trainable parameter count : 2544640 (local rank), 2544640 (all). +[02:22:55.156925] ## Load pretrained from ../checkpoints/llama2/Llama-2-13b/ +[02:23:23.489563] ## Quantizing model to 4bit! 
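
The `Trainable parameter count : 2544640` line reflects the PEFT configuration dumped above (`llama_type='llama_peft'` with `bias_tuning=True`; the `normBias` run name suggests norm-and-bias tuning): only a tiny fraction of the 13B parameters requires gradients, and together with `only_save_trainable=True` this is why the consolidated `*.model.pth` checkpoints in this diff are megabytes rather than full-model-sized. A generic sketch of how such a count and a trainable-only state dict can be produced for any PyTorch module (illustrative, not the repository's own implementation):

```python
import torch.nn as nn

def count_trainable(model: nn.Module) -> int:
    """Count parameters with requires_grad=True, as reported in the log
    (e.g. 2544640 for this run)."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def trainable_state_dict(model: nn.Module) -> dict:
    """Keep only trainable tensors, mirroring only_save_trainable=True."""
    keep = {n for n, p in model.named_parameters() if p.requires_grad}
    return {k: v for k, v in model.state_dict().items() if k in keep}
```
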
+ Qunatization Process: 0%| | 0/487 [00:00<?, ?it/s] +[02:51:32.419486] Start training for 4 epochs +[02:51:32.424561] log_dir: ./output_dir +[02:51:37.180990] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.2368 (1.2368) closs: 1.0616 (1.0616) time: 4.7553 data: 1.6005 max mem: 18726 +[02:51:56.653801] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.3463 (1.3196) closs: 1.0616 (1.0548) time: 2.2025 data: 0.1457 max mem: 27896 +[02:52:15.510287] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.3090 (1.2957) closs: 1.0002 (1.0350) time: 1.9164 data: 0.0002 max mem: 27896 +[02:52:34.499639] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.2693 (1.2965) closs: 1.0493 (1.0553) time: 1.8922 data: 0.0002 max mem: 27896 +[02:52:53.410651] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.2444 (1.2760) closs: 1.0673 (1.0619) time: 1.8949 data: 0.0002 max mem: 27896 +[02:53:12.478456] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.1888 (1.2653) closs: 1.0609 (1.0622) time: 1.8988 data: 0.0002 max mem: 27896 +[02:53:31.393636] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.1431 (1.2403) closs: 1.0390 (1.0578) time: 1.8991 data: 0.0002 max mem: 27896 +[02:53:50.403916] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.0537 (1.2016) closs: 1.0404 (1.0618) time: 1.8962 data: 0.0002 max mem: 27896 +[02:54:09.283519] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 0.9215 (1.1737) closs: 1.0404 (1.0612) time: 1.8944 data: 0.0002 max mem: 27896 +[02:54:28.281118] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.8322 (1.1354) closs: 1.0057 (1.0516) time: 1.8937 data: 0.0002 max mem: 27896 +[02:54:47.146602] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7272 (1.0881) closs: 0.9742 (1.0478) time: 1.8931 data: 0.0002 max mem: 27896 +[02:55:06.025584] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.6155 (1.0440) closs: 0.9687 (1.0375) time: 1.8871 data: 0.0002 max mem: 27896 +[02:55:24.976544] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.5469 (0.9996) closs: 0.9673 (1.0377) time: 1.8914 data: 0.0002 max mem: 27896 +[02:55:43.830404] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.4990 (0.9606) closs: 0.9881 (1.0344) time: 1.8902 data: 0.0002 max mem: 27896 +[02:56:02.840447] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.4640 (0.9261) closs: 0.9172 (1.0263) time: 1.8931 data: 0.0002 max mem: 27896 +[02:56:21.715218] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.4640 (0.8962) closs: 0.9242 (1.0218) time: 1.8942 data: 0.0002 max mem: 27896 +[02:56:40.929946] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.4432 (0.8666) closs: 0.9550 (1.0186) time: 1.9044 data: 0.0002 max mem: 27896 +[02:56:59.814846] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.4047 (0.8400) closs: 0.9260 (1.0121) time: 1.9049 data: 0.0002 max mem: 27896 +[02:57:18.798007] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.3673 (0.8123) closs: 0.8709 (1.0037) time: 1.8933 data: 0.0002 max mem: 27896 +[02:57:37.654408] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.3308 (0.7864) closs: 0.8709 (0.9977) time: 1.8919 data: 0.0002 max mem: 27896 +[02:57:56.600103] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.3035 (0.7639) closs: 0.9270 (0.9951) time: 1.8900 data: 0.0002 max mem: 27896 +[02:58:15.461031] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.3035 (0.7424) closs: 0.9337 (0.9907) time: 1.8903 data: 0.0002 max mem: 27896 +[02:58:34.481503] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.3010 (0.7223) closs: 0.9109 (0.9876) time: 1.8940 data: 0.0002 max mem: 27896 +[02:58:53.318663] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.2931 (0.7045) closs: 0.8898 (0.9813) time: 1.8928 data:
0.0002 max mem: 27896 +[02:59:12.273142] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.2934 (0.6879) closs: 0.8515 (0.9774) time: 1.8895 data: 0.0002 max mem: 27896 +[02:59:31.148548] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.3028 (0.6734) closs: 0.9104 (0.9749) time: 1.8914 data: 0.0002 max mem: 27896 +[02:59:50.126862] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.2997 (0.6589) closs: 0.9027 (0.9712) time: 1.8926 data: 0.0002 max mem: 27896 +[03:00:08.984543] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.2848 (0.6456) closs: 0.9058 (0.9701) time: 1.8917 data: 0.0002 max mem: 27896 +[03:00:27.916360] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.2850 (0.6339) closs: 0.9158 (0.9670) time: 1.8894 data: 0.0002 max mem: 27896 +[03:00:46.795221] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.2990 (0.6227) closs: 0.8726 (0.9636) time: 1.8905 data: 0.0002 max mem: 27896 +[03:01:05.829556] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.3051 (0.6121) closs: 0.8665 (0.9607) time: 1.8956 data: 0.0002 max mem: 27896 +[03:01:24.744180] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.3069 (0.6027) closs: 0.8881 (0.9598) time: 1.8974 data: 0.0002 max mem: 27896 +[03:01:43.610809] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.3030 (0.5933) closs: 0.9071 (0.9591) time: 1.8890 data: 0.0003 max mem: 27896 +[03:02:02.590087] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.2972 (0.5858) closs: 0.8722 (0.9554) time: 1.8922 data: 0.0003 max mem: 27896 +[03:02:21.416511] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.3050 (0.5778) closs: 0.8452 (0.9523) time: 1.8902 data: 0.0002 max mem: 27896 +[03:02:40.411438] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.2947 (0.5697) closs: 0.8839 (0.9532) time: 1.8910 data: 0.0002 max mem: 27896 +[03:02:59.253600] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.2946 (0.5626) closs: 0.9033 (0.9514) time: 1.8918 data: 0.0002 max mem: 27896 +[03:03:18.214034] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.2946 (0.5564) closs: 0.8445 (0.9490) time: 1.8901 data: 0.0002 max mem: 27896 +[03:03:37.033540] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.2868 (0.5501) closs: 0.8445 (0.9476) time: 1.8889 data: 0.0002 max mem: 27896 +[03:03:55.989931] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.2806 (0.5436) closs: 0.8717 (0.9460) time: 1.8887 data: 0.0002 max mem: 27896 +[03:04:14.812323] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.2848 (0.5388) closs: 0.8652 (0.9447) time: 1.8889 data: 0.0002 max mem: 27896 +[03:04:33.756960] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.2892 (0.5332) closs: 0.8652 (0.9431) time: 1.8883 data: 0.0002 max mem: 27896 +[03:04:52.642341] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.2892 (0.5293) closs: 0.8611 (0.9411) time: 1.8914 data: 0.0002 max mem: 27896 +[03:05:11.582135] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.3156 (0.5253) closs: 0.8376 (0.9384) time: 1.8912 data: 0.0002 max mem: 27896 +[03:05:30.444304] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.3005 (0.5203) closs: 0.8013 (0.9357) time: 1.8900 data: 0.0002 max mem: 27896 +[03:05:49.417047] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.2965 (0.5153) closs: 0.8168 (0.9333) time: 1.8917 data: 0.0002 max mem: 27896 +[03:06:08.266807] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.2900 (0.5106) closs: 0.8519 (0.9319) time: 1.8910 data: 0.0002 max mem: 27896 +[03:06:27.237157] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.2993 (0.5069) closs: 0.8519 (0.9312) time: 1.8909 data: 0.0002 max mem: 27896 +[03:06:46.077686] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.3083 
(0.5031) closs: 0.9032 (0.9304) time: 1.8905 data: 0.0002 max mem: 27896 +[03:07:05.047292] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.2765 (0.4983) closs: 0.8994 (0.9296) time: 1.8904 data: 0.0002 max mem: 27896 +[03:07:23.915717] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.2689 (0.4940) closs: 0.8750 (0.9282) time: 1.8918 data: 0.0002 max mem: 27896 +[03:07:42.875114] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.2883 (0.4920) closs: 0.8579 (0.9260) time: 1.8913 data: 0.0002 max mem: 27896 +[03:08:01.733927] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.3075 (0.4881) closs: 0.7963 (0.9248) time: 1.8908 data: 0.0002 max mem: 27896 +[03:08:20.563694] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.3265 (0.4861) closs: 0.8782 (0.9239) time: 1.8844 data: 0.0002 max mem: 27896 +[03:08:39.514327] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.3301 (0.4828) closs: 0.9101 (0.9233) time: 1.8890 data: 0.0002 max mem: 27896 +[03:08:58.358994] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.3194 (0.4800) closs: 0.8802 (0.9221) time: 1.8897 data: 0.0002 max mem: 27896 +[03:09:17.294100] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.3228 (0.4772) closs: 0.8471 (0.9214) time: 1.8889 data: 0.0002 max mem: 27896 +[03:09:36.134535] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.2998 (0.4749) closs: 0.8602 (0.9206) time: 1.8887 data: 0.0002 max mem: 27896 +[03:09:55.071136] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.2998 (0.4723) closs: 0.8705 (0.9203) time: 1.8888 data: 0.0002 max mem: 27896 +[03:10:13.960266] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.3104 (0.4695) closs: 0.8705 (0.9193) time: 1.8912 data: 0.0002 max mem: 27896 +[03:10:32.941351] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.2986 (0.4666) closs: 0.8547 (0.9186) time: 1.8934 data: 0.0002 max mem: 27896 +[03:10:51.868015] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.3084 (0.4641) closs: 0.8324 (0.9164) time: 1.8953 data: 0.0002 max mem: 27896 +[03:11:10.823823] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.3157 (0.4616) closs: 0.7938 (0.9157) time: 1.8941 data: 0.0002 max mem: 27896 +[03:11:29.685695] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.3046 (0.4592) closs: 0.8578 (0.9145) time: 1.8908 data: 0.0002 max mem: 27896 +[03:11:48.663630] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.3046 (0.4568) closs: 0.8596 (0.9142) time: 1.8919 data: 0.0002 max mem: 27896 +[03:12:07.520528] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.3001 (0.4547) closs: 0.8522 (0.9129) time: 1.8917 data: 0.0002 max mem: 27896 +[03:12:26.490698] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.2877 (0.4523) closs: 0.8184 (0.9122) time: 1.8913 data: 0.0002 max mem: 27896 +[03:12:45.398835] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.2794 (0.4497) closs: 0.8443 (0.9117) time: 1.8938 data: 0.0002 max mem: 27896 +[03:13:04.374147] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.2880 (0.4478) closs: 0.8587 (0.9112) time: 1.8941 data: 0.0002 max mem: 27896 +[03:13:23.286590] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.3029 (0.4457) closs: 0.8656 (0.9107) time: 1.8943 data: 0.0002 max mem: 27896 +[03:13:42.230027] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.3407 (0.4445) closs: 0.8605 (0.9102) time: 1.8927 data: 0.0002 max mem: 27896 +[03:14:01.080842] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.3416 (0.4431) closs: 0.8290 (0.9091) time: 1.8896 data: 0.0002 max mem: 27896 +[03:14:20.026951] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.3232 (0.4414) closs: 0.8416 (0.9086) time: 1.8898 data: 0.0002 max mem: 27896 +[03:14:38.859533] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.3002 (0.4395) closs: 0.8626 (0.9077) time: 1.8889 data: 0.0002 max mem: 27896 +[03:14:57.748386] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.2982 (0.4376) closs: 0.8342 (0.9064) time: 1.8860 data: 0.0002 max mem: 27896 +[03:15:16.711446] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.3052 (0.4363) closs: 0.8395 (0.9072) time: 1.8925 data: 0.0002 max mem: 27896 +[03:15:35.570759] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.3141 (0.4346) closs: 0.8774 (0.9065) time: 1.8910 data: 0.0002 max mem: 27896 +[03:15:54.542427] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.3141 (0.4332) closs: 0.8427 (0.9055) time: 1.8915 data: 0.0002 max mem: 27896 +[03:16:13.384792] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.3178 (0.4316) closs: 0.8427 (0.9050) time: 1.8906 data: 0.0002 max mem: 27896 +[03:16:32.365361] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.2918 (0.4301) closs: 0.8414 (0.9041) time: 1.8911 data: 0.0002 max mem: 27896 +[03:16:51.471742] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.2894 (0.4286) closs: 0.8447 (0.9036) time: 1.9043 data: 0.0002 max mem: 27896 +[03:17:10.470921] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.2894 (0.4278) closs: 0.8293 (0.9023) time: 1.9052 data: 0.0001 max mem: 27896 +[03:17:12.550725] Epoch: [0] Total time: 0:25:40 +[03:17:12.557705] Averaged stats: lr: 0.000050 grad_norm: 0.2894 (0.4276) closs: 0.8217 (0.9074) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[03:17:12.633835] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[03:17:13.416849] optimizer saved +[03:17:13.417272] other rank-common saved +[03:17:13.419172] rank-specific saved +[03:17:13.424182] log_dir: ./output_dir +[03:17:16.591962] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.3125 (0.3125) closs: 0.6480 (0.6480) time: 3.1669 data: 1.2401 max mem: 27896 +[03:17:35.447743] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.3209 (0.3396) closs: 0.8064 (0.7937) time: 2.0020 data: 0.1129 max mem: 27896 +[03:17:54.386301] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.3280 (0.3354) closs: 0.8198 (0.8550) time: 1.8896 data: 0.0002 max mem: 27896 +[03:18:13.203946] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.3237 (0.3326) closs: 0.8367 (0.8505) time: 1.8877 data: 0.0002 max mem: 27896 +[03:18:32.059763] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.3117 (0.3285) closs: 0.8744 (0.8705) time: 1.8836 data: 0.0002 max mem: 27896 +[03:18:50.851265] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.3022 (0.3242) closs: 0.8867 (0.8662) time: 1.8823 data: 0.0002 max mem: 27896 +[03:19:09.770410] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.3107 (0.3216) closs: 0.8414 (0.8637) time: 1.8855 data: 0.0002 max mem: 27896 +[03:19:28.603865] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.3107 (0.3218) closs: 0.8715 (0.8666) time: 1.8876 data: 0.0002 max mem: 27896 +[03:19:47.568062] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.3189 (0.3245) closs: 0.8715 (0.8625) time: 1.8898 data: 0.0002 max mem: 27896 +[03:20:06.507893] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.2924 (0.3215) closs: 0.8712 (0.8666) time: 1.8951 data: 0.0001 max mem: 27896 +[03:20:25.438363] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.2860 (0.3200) closs: 0.8985 (0.8687) time: 1.8934 data: 0.0002 max mem: 27896 +[03:20:44.375178] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.3010 (0.3197) closs: 0.8714 (0.8658) time: 1.8933 data: 0.0002 max mem: 27896 +[03:21:03.380453] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.3176 (0.3210) closs: 0.8714 (0.8665) time: 1.8970 data: 0.0001 max mem: 27896 +[03:21:22.330716] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.3235 (0.3209) closs: 0.8674 (0.8666) time: 1.8977 data: 0.0001 max mem: 27896 +[03:21:41.288988] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.3232 (0.3216) closs: 0.8588 (0.8663) time: 1.8954 data: 0.0002 max mem: 27896 +[03:22:00.301536] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.2895 (0.3203) closs: 0.8738 (0.8659) time: 1.8985 data: 0.0002 max mem: 27896 +[03:22:19.237615] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.2951 (0.3205) closs: 0.8738 (0.8684) time: 1.8974 data: 0.0001 max mem: 27896 +[03:22:38.171105] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.3091 (0.3205) 
closs: 0.9185 (0.8708) time: 1.8934 data: 0.0001 max mem: 27896 +[03:22:57.187937] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.3005 (0.3198) closs: 0.8777 (0.8709) time: 1.8975 data: 0.0001 max mem: 27896 +[03:23:16.138555] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.2864 (0.3175) closs: 0.8148 (0.8694) time: 1.8983 data: 0.0002 max mem: 27896 +[03:23:35.201639] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.2781 (0.3173) closs: 0.8484 (0.8675) time: 1.9006 data: 0.0002 max mem: 27896 +[03:23:54.136076] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.2996 (0.3176) closs: 0.8737 (0.8697) time: 1.8998 data: 0.0002 max mem: 27896 +[03:24:13.166403] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.3093 (0.3172) closs: 0.8767 (0.8683) time: 1.8982 data: 0.0001 max mem: 27896 +[03:24:32.133003] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.2879 (0.3159) closs: 0.8170 (0.8667) time: 1.8998 data: 0.0001 max mem: 27896 +[03:24:51.228672] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.2825 (0.3146) closs: 0.8517 (0.8678) time: 1.9030 data: 0.0002 max mem: 27896 +[03:25:10.186686] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.3003 (0.3158) closs: 0.8893 (0.8669) time: 1.9026 data: 0.0002 max mem: 27896 +[03:25:29.165652] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.3473 (0.3173) closs: 0.8900 (0.8674) time: 1.8968 data: 0.0002 max mem: 27896 +[03:25:48.150931] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.2909 (0.3169) closs: 0.8733 (0.8661) time: 1.8981 data: 0.0002 max mem: 27896 +[03:26:07.074547] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.2983 (0.3172) closs: 0.8733 (0.8664) time: 1.8954 data: 0.0001 max mem: 27896 +[03:26:26.101288] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.2983 (0.3173) closs: 0.8555 (0.8667) time: 1.8974 data: 0.0002 max mem: 27896 +[03:26:45.048249] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.3011 (0.3168) closs: 0.8480 (0.8656) time: 1.8986 data: 0.0002 max mem: 27896 +[03:27:04.100352] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.3140 (0.3176) closs: 0.8480 (0.8648) time: 1.8999 data: 0.0002 max mem: 27896 +[03:27:23.049115] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.3152 (0.3181) closs: 0.8513 (0.8656) time: 1.9000 data: 0.0002 max mem: 27896 +[03:27:42.072420] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.2992 (0.3181) closs: 0.8314 (0.8651) time: 1.8985 data: 0.0002 max mem: 27896 +[03:28:01.013759] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.3095 (0.3191) closs: 0.8002 (0.8631) time: 1.8982 data: 0.0002 max mem: 27896 +[03:28:19.984896] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.3020 (0.3184) closs: 0.8002 (0.8626) time: 1.8955 data: 0.0002 max mem: 27896 +[03:28:39.032148] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.2855 (0.3181) closs: 0.7816 (0.8603) time: 1.9009 data: 0.0002 max mem: 27896 +[03:28:57.981034] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.3070 (0.3185) closs: 0.7695 (0.8594) time: 1.8997 data: 0.0001 max mem: 27896 +[03:29:16.955211] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.2980 (0.3177) closs: 0.8399 (0.8595) time: 1.8961 data: 0.0001 max mem: 27896 +[03:29:36.033549] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.2980 (0.3188) closs: 0.8620 (0.8597) time: 1.9026 data: 0.0002 max mem: 27896 +[03:29:54.991316] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.2939 (0.3181) closs: 0.8539 (0.8596) time: 1.9017 data: 0.0002 max mem: 27896 +[03:30:14.020931] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.2939 (0.3179) closs: 0.8723 (0.8610) time: 1.8993 data: 0.0002 max mem: 27896 +[03:30:32.994084] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.2965 (0.3175) closs: 0.9123 (0.8614) time: 1.9001 data: 0.0002 max mem: 27896 +[03:30:52.010141] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.3213 (0.3190) closs: 0.8477 (0.8606) time: 1.8994 data: 0.0002 max mem: 27896 +[03:31:10.982488] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.3319 (0.3193) closs: 0.8510 (0.8606) time: 1.8994 data: 0.0002 max mem: 27896 +[03:31:30.070904] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.3319 (0.3200) closs: 0.8574 (0.8601) time: 1.9030 data: 0.0002 max mem: 27896 +[03:31:48.984566] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.3272 (0.3199) closs: 0.8117 (0.8596) time: 1.9000 data: 0.0002 max mem: 27896 +[03:32:07.916033] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.3272 (0.3203) closs: 0.8229 (0.8597) time: 1.8922 data: 0.0002 max mem: 27896 +[03:32:26.962961] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.3430 (0.3215) closs: 0.8518 (0.8603) time: 1.8988 data: 0.0002 max mem: 27896 +[03:32:45.959510] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.3272 (0.3215) closs: 0.8580 (0.8609) time: 1.9021 data: 0.0002 max mem: 27896 +[03:33:05.027682] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.3090 (0.3216) closs: 0.8381 (0.8604) time: 1.9032 data: 0.0002 max mem: 27896 +[03:33:23.971362] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.2950 (0.3209) closs: 0.8181 (0.8601) time: 1.9005 data: 0.0002 max mem: 27896 +[03:33:43.025814] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.2972 (0.3213) closs: 0.8379 (0.8600) time: 1.8998 data: 0.0002 max mem: 27896 +[03:34:02.005637] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.2975 (0.3208) closs: 0.8449 (0.8601) time: 1.9016 data: 0.0002 max mem: 27896 +[03:34:21.081675] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.2917 (0.3201) closs: 0.8353 (0.8602) time: 1.9027 data: 0.0002 max mem: 27896 +[03:34:40.078513] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.2818 (0.3200) closs: 0.8204 (0.8595) time: 1.9036 data: 0.0002 max mem: 27896 +[03:34:59.061182] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.2818 (0.3203) closs: 0.8368 (0.8594) time: 1.8989 data: 0.0002 max mem: 27896 +[03:35:18.076387] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.2940 (0.3200) closs: 0.8546 (0.8598) time: 1.8998 data: 0.0002 max mem: 27896 +[03:35:37.024489] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.2995 (0.3199) closs: 0.8630 (0.8592) time: 1.8981 data: 0.0002 max mem: 27896 +[03:35:55.936245] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.3004 (0.3198) closs: 0.8705 (0.8594) time: 1.8929 data: 0.0001 max mem: 27896 +[03:36:14.971334] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.2997 (0.3196) closs: 0.8623 (0.8597) time: 1.8973 data: 0.0001 max mem: 27896 +[03:36:33.918017] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.2937 (0.3191) closs: 0.8426 (0.8597) time: 1.8990 data: 0.0002 max mem: 27896 +[03:36:52.988382] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.2937 (0.3188) closs: 0.8482 (0.8598) time: 1.9008 data: 0.0002 max mem: 27896 +[03:37:11.918731] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.2877 (0.3183) closs: 0.8632 (0.8602) time: 1.9000 data: 0.0002 max mem: 27896 +[03:37:30.972461] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.2992 (0.3184) closs: 0.8567 (0.8603) time: 1.8991 data: 0.0002 max mem: 27896 +[03:37:49.926451] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.3033 (0.3182) closs: 0.8303 (0.8600) time: 1.9003 data: 0.0002 max mem: 27896 +[03:38:09.012642] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.2824 (0.3180) closs: 0.8863 (0.8610) time: 1.9019 data: 0.0002 
max mem: 27896 +[03:38:27.968324] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.3086 (0.3179) closs: 0.9184 (0.8616) time: 1.9020 data: 0.0002 max mem: 27896 +[03:38:46.902521] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.3148 (0.3180) closs: 0.9153 (0.8622) time: 1.8944 data: 0.0002 max mem: 27896 +[03:39:05.936878] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.3135 (0.3184) closs: 0.9215 (0.8627) time: 1.8984 data: 0.0002 max mem: 27896 +[03:39:24.940302] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.3135 (0.3184) closs: 0.8515 (0.8626) time: 1.9018 data: 0.0002 max mem: 27896 +[03:39:44.005071] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.2938 (0.3180) closs: 0.8168 (0.8624) time: 1.9033 data: 0.0002 max mem: 27896 +[03:40:02.939301] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.2826 (0.3177) closs: 0.8161 (0.8619) time: 1.8999 data: 0.0002 max mem: 27896 +[03:40:21.995546] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.2945 (0.3175) closs: 0.8535 (0.8620) time: 1.8994 data: 0.0003 max mem: 27896 +[03:40:40.932897] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.2991 (0.3176) closs: 0.8593 (0.8619) time: 1.8996 data: 0.0003 max mem: 27896 +[03:40:59.951242] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.3080 (0.3176) closs: 0.8315 (0.8617) time: 1.8977 data: 0.0002 max mem: 27896 +[03:41:18.894508] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.3038 (0.3174) closs: 0.8399 (0.8620) time: 1.8980 data: 0.0002 max mem: 27896 +[03:41:37.827976] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.2959 (0.3172) closs: 0.8679 (0.8615) time: 1.8938 data: 0.0002 max mem: 27896 +[03:41:56.837040] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.3130 (0.3175) closs: 0.8068 (0.8613) time: 1.8971 data: 0.0002 max mem: 27896 +[03:42:15.740854] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.3148 (0.3176) closs: 0.8560 (0.8615) time: 1.8956 data: 0.0001 max mem: 27896 +[03:42:34.640288] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.2908 (0.3177) closs: 0.8465 (0.8611) time: 1.8901 data: 0.0001 max mem: 27896 +[03:42:53.636957] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.3240 (0.3181) closs: 0.8465 (0.8611) time: 1.8947 data: 0.0001 max mem: 27896 +[03:42:55.771594] Epoch: [1] Total time: 0:25:42 +[03:42:55.780429] Averaged stats: lr: 0.000039 grad_norm: 0.3240 (0.3180) closs: 0.8698 (0.8587) +[03:42:55.860690] model saved +[03:42:56.725243] optimizer saved +[03:42:56.725711] other rank-common saved +[03:42:56.727577] rank-specific saved +[03:42:56.732685] log_dir: ./output_dir +[03:42:59.943195] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.2991 (0.2991) closs: 0.8053 (0.8053) time: 3.2096 data: 1.2947 max mem: 27896 +[03:43:18.767681] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.2981 (0.3024) closs: 0.8402 (0.8487) time: 2.0030 data: 0.1178 max mem: 27896 +[03:43:37.559302] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.2907 (0.2977) closs: 0.8531 (0.8402) time: 1.8807 data: 0.0002 max mem: 27896 +[03:43:56.328664] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.2892 (0.2990) closs: 0.8531 (0.8497) time: 1.8780 data: 0.0002 max mem: 27896 +[03:44:15.360720] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.3005 (0.3036) closs: 0.8428 (0.8486) time: 1.8900 data: 0.0002 max mem: 27896 +[03:44:34.223213] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.3022 (0.3076) closs: 0.8627 (0.8444) time: 1.8946 data: 0.0002 max mem: 27896 +[03:44:53.324545] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.3037 (0.3084) closs: 0.8664 (0.8472) time: 1.8981 data: 0.0002 max mem: 27896 +[03:45:12.223238] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.2974 (0.3133) closs: 0.8676 (0.8517) time: 1.8999 data: 0.0002 max mem: 27896 +[03:45:31.177084] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.2974 (0.3155) closs: 0.8256 (0.8500) time: 1.8926 data: 0.0002 max mem: 27896 +[03:45:50.121054] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.2943 (0.3140) closs: 0.8076 (0.8475) time: 1.8948 data: 0.0002 max mem: 27896 +[03:46:09.082639] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.2897 (0.3147) closs: 0.8119 (0.8487) time: 1.8952 data: 0.0002 max mem: 27896 +[03:46:28.081000] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.3119 (0.3176) closs: 0.8299 (0.8439) time: 1.8979 data: 0.0002 max mem: 27896 +[03:46:47.036342] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.3081 (0.3179) closs: 0.8515 (0.8454) time: 1.8976 data: 0.0002 max mem: 27896 +[03:47:05.993187] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.2996 (0.3170) closs: 0.8354 (0.8443) time: 1.8955 data: 0.0002 max mem: 27896 +[03:47:24.929514] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.3093 (0.3183) closs: 0.8190 (0.8438) time: 1.8946 data: 0.0002 max mem: 27896 +[03:47:43.876677] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.3143 (0.3207) closs: 0.8561 (0.8450) time: 1.8941 data: 0.0002 max mem: 27896 +[03:48:02.795947] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.3143 (0.3201) closs: 0.8929 (0.8460) time: 1.8933 data: 0.0002 max mem: 27896 +[03:48:21.796307] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.3005 (0.3200) closs: 0.8243 (0.8450) time: 1.8959 data: 0.0002 max mem: 27896 +[03:48:40.746031] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.3005 (0.3187) closs: 0.8145 (0.8446) time: 1.8974 data: 0.0002 max mem: 27896 +[03:48:59.708301] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.2997 (0.3187) closs: 0.8326 (0.8457) time: 1.8955 data: 0.0002 max mem: 27896 +[03:49:18.559350] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.2997 (0.3192) closs: 0.8350 (0.8467) time: 1.8906 data: 0.0002 max mem: 27896 +[03:49:37.630429] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.3154 (0.3202) closs: 0.8350 (0.8471) time: 1.8960 data: 0.0002 max mem: 27896 +[03:49:56.492380] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.3290 (0.3208) closs: 0.8725 (0.8488) time: 1.8966 data: 0.0002 max mem: 27896 +[03:50:15.471718] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.3217 (0.3205) closs: 0.8615 (0.8494) time: 1.8920 data: 0.0002 max mem: 27896 +[03:50:34.336505] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.2963 (0.3207) closs: 0.8458 (0.8492) time: 1.8921 data: 0.0002 max mem: 27896 +[03:50:53.385960] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.2963 (0.3202) closs: 0.8483 (0.8498) time: 1.8956 data: 0.0002 max mem: 27896 +[03:51:12.244932] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.3020 (0.3205) closs: 0.8005 (0.8491) time: 1.8953 data: 0.0002 max mem: 27896 +[03:51:31.317945] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.2977 (0.3204) closs: 0.8133 (0.8502) time: 1.8965 data: 0.0002 max mem: 27896 +[03:51:50.166003] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.2890 (0.3196) closs: 0.8392 (0.8516) time: 1.8960 data: 0.0002 max mem: 27896 +[03:52:09.099328] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.3024 (0.3196) closs: 0.8183 (0.8484) time: 1.8890 data: 0.0002 max mem: 27896 +[03:52:28.076001] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.2902 (0.3187) closs: 0.8146 (0.8481) time: 1.8954 data: 0.0002 max mem: 27896 +[03:52:47.002133] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.3055 (0.3195) closs: 0.8441 (0.8482) time: 1.8951 data: 0.0002 max mem: 27896 
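A note on reading these records: each metric prints as recent-value (epoch running average), so in "closs: 0.8441 (0.8482)" the parenthesized number is the running average, which is what ends up as train_closs in log.txt at epoch end. To turn the wall of text into curves, a throwaway parser over output.log suffices; the helper name below is hypothetical, and the pattern is copied from the records above:

import re

# Matches records like:
# [03:52:47.002133] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.3055 (0.3195) closs: 0.8441 (0.8482) ...
PAT = re.compile(
    r"Epoch: \[(\d+)\] \[(\d+)/\d+\] lr: ([\d.]+) "
    r"grad_norm: ([\d.]+) \(([\d.]+)\) closs: ([\d.]+) \(([\d.]+)\)"
)

def parse_output_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            m = PAT.search(line)
            if m:
                epoch, it, lr, gn, gn_avg, closs, closs_avg = m.groups()
                rows.append((int(epoch), int(it), float(lr), float(gn),
                             float(gn_avg), float(closs), float(closs_avg)))
    return rows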
+[03:53:05.972836] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.3280 (0.3197) closs: 0.8456 (0.8494) time: 1.8948 data: 0.0002 max mem: 27896 +[03:53:24.927527] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.3214 (0.3210) closs: 0.8868 (0.8509) time: 1.8962 data: 0.0002 max mem: 27896 +[03:53:43.909637] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.3140 (0.3209) closs: 0.9016 (0.8518) time: 1.8968 data: 0.0002 max mem: 27896 +[03:54:02.886469] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.3002 (0.3203) closs: 0.8522 (0.8515) time: 1.8979 data: 0.0002 max mem: 27896 +[03:54:21.844980] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.2893 (0.3196) closs: 0.8485 (0.8523) time: 1.8967 data: 0.0002 max mem: 27896 +[03:54:40.781893] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.3001 (0.3192) closs: 0.8485 (0.8522) time: 1.8947 data: 0.0002 max mem: 27896 +[03:54:59.740019] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.3056 (0.3227) closs: 0.8477 (0.8514) time: 1.8947 data: 0.0002 max mem: 27896 +[03:55:18.678795] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.3007 (0.3224) closs: 0.8237 (0.8500) time: 1.8948 data: 0.0002 max mem: 27896 +[03:55:37.600494] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.3007 (0.3219) closs: 0.8250 (0.8501) time: 1.8930 data: 0.0002 max mem: 27896 +[03:55:56.472663] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.3009 (0.3216) closs: 0.8250 (0.8499) time: 1.8896 data: 0.0002 max mem: 27896 +[03:56:15.500175] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.3128 (0.3219) closs: 0.8157 (0.8500) time: 1.8949 data: 0.0003 max mem: 27896 +[03:56:34.380680] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.3113 (0.3214) closs: 0.8179 (0.8505) time: 1.8953 data: 0.0004 max mem: 27896 +[03:56:53.398340] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.3011 (0.3245) closs: 0.8574 (0.8503) time: 1.8948 data: 0.0002 max mem: 27896 +[03:57:12.203346] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.3066 (0.3242) closs: 0.8575 (0.8514) time: 1.8911 data: 0.0002 max mem: 27896 +[03:57:31.281200] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.3004 (0.3246) closs: 0.9145 (0.8534) time: 1.8941 data: 0.0001 max mem: 27896 +[03:57:50.139462] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.3004 (0.3239) closs: 0.8963 (0.8530) time: 1.8967 data: 0.0001 max mem: 27896 +[03:58:09.179436] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.2824 (0.3238) closs: 0.8761 (0.8542) time: 1.8948 data: 0.0002 max mem: 27896 +[03:58:28.025942] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.2953 (0.3236) closs: 0.8482 (0.8534) time: 1.8943 data: 0.0002 max mem: 27896 +[03:58:46.950908] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.2953 (0.3233) closs: 0.8324 (0.8530) time: 1.8885 data: 0.0002 max mem: 27896 +[03:59:05.900493] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.2942 (0.3227) closs: 0.8472 (0.8531) time: 1.8937 data: 0.0002 max mem: 27896 +[03:59:24.850132] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.2936 (0.3224) closs: 0.8472 (0.8531) time: 1.8949 data: 0.0002 max mem: 27896 +[03:59:43.822183] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.3021 (0.3224) closs: 0.8391 (0.8533) time: 1.8960 data: 0.0002 max mem: 27896 +[04:00:02.715570] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.3103 (0.3226) closs: 0.8391 (0.8531) time: 1.8932 data: 0.0002 max mem: 27896 +[04:00:21.684010] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.3008 (0.3223) closs: 0.8504 (0.8529) time: 1.8930 data: 0.0002 max mem: 27896 +[04:00:40.626087] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.2991 (0.3220) closs: 0.8343 
(0.8530) time: 1.8955 data: 0.0002 max mem: 27896 +[04:00:59.536283] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.3049 (0.3218) closs: 0.8319 (0.8523) time: 1.8925 data: 0.0002 max mem: 27896 +[04:01:18.455498] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.2932 (0.3219) closs: 0.8129 (0.8513) time: 1.8914 data: 0.0002 max mem: 27896 +[04:01:37.396965] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.2951 (0.3221) closs: 0.8361 (0.8503) time: 1.8930 data: 0.0002 max mem: 27896 +[04:01:56.333263] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.2951 (0.3216) closs: 0.8458 (0.8503) time: 1.8938 data: 0.0002 max mem: 27896 +[04:02:15.228857] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.2950 (0.3215) closs: 0.8371 (0.8502) time: 1.8915 data: 0.0002 max mem: 27896 +[04:02:34.062409] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.3077 (0.3212) closs: 0.8371 (0.8501) time: 1.8864 data: 0.0002 max mem: 27896 +[04:02:53.092759] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.2967 (0.3209) closs: 0.8287 (0.8500) time: 1.8931 data: 0.0002 max mem: 27896 +[04:03:11.911605] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.2954 (0.3207) closs: 0.8199 (0.8500) time: 1.8924 data: 0.0002 max mem: 27896 +[04:03:30.948186] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.3093 (0.3212) closs: 0.7838 (0.8485) time: 1.8927 data: 0.0002 max mem: 27896 +[04:03:49.838472] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.3031 (0.3209) closs: 0.7751 (0.8479) time: 1.8963 data: 0.0002 max mem: 27896 +[04:04:08.869671] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.2833 (0.3208) closs: 0.7997 (0.8473) time: 1.8960 data: 0.0002 max mem: 27896 +[04:04:27.733943] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.2833 (0.3206) closs: 0.8524 (0.8480) time: 1.8947 data: 0.0002 max mem: 27896 +[04:04:46.759941] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.2948 (0.3202) closs: 0.8962 (0.8483) time: 1.8944 data: 0.0002 max mem: 27896 +[04:05:05.600006] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.3046 (0.3202) closs: 0.8425 (0.8481) time: 1.8932 data: 0.0002 max mem: 27896 +[04:05:24.547147] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.3177 (0.3202) closs: 0.8007 (0.8475) time: 1.8893 data: 0.0002 max mem: 27896 +[04:05:43.510683] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.3048 (0.3202) closs: 0.8007 (0.8476) time: 1.8955 data: 0.0002 max mem: 27896 +[04:06:02.433316] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.3199 (0.3203) closs: 0.8372 (0.8483) time: 1.8942 data: 0.0002 max mem: 27896 +[04:06:21.364546] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.3199 (0.3204) closs: 0.8130 (0.8483) time: 1.8926 data: 0.0002 max mem: 27896 +[04:06:40.294476] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.3025 (0.3201) closs: 0.8130 (0.8483) time: 1.8930 data: 0.0002 max mem: 27896 +[04:06:59.218302] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.2889 (0.3199) closs: 0.8377 (0.8477) time: 1.8926 data: 0.0002 max mem: 27896 +[04:07:18.167960] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.2964 (0.3197) closs: 0.8568 (0.8483) time: 1.8936 data: 0.0002 max mem: 27896 +[04:07:37.105525] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.3041 (0.3196) closs: 0.8957 (0.8489) time: 1.8943 data: 0.0001 max mem: 27896 +[04:07:56.039500] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.3081 (0.3194) closs: 0.8753 (0.8492) time: 1.8935 data: 0.0001 max mem: 27896 +[04:08:14.960851] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.3034 (0.3192) closs: 0.8286 (0.8495) time: 1.8927 data: 0.0001 max mem: 27896 +[04:08:33.883409] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.3034 (0.3193) closs: 0.8577 (0.8499) time: 1.8921 data: 0.0001 max mem: 27896 +[04:08:36.022537] Epoch: [2] Total time: 0:25:39 +[04:08:36.035493] Averaged stats: lr: 0.000016 grad_norm: 0.3008 (0.3193) closs: 0.8684 (0.8498) +[04:08:36.115200] model saved +[04:08:36.872967] optimizer saved +[04:08:36.873428] other rank-common saved +[04:08:36.875290] rank-specific saved +[04:08:36.880381] log_dir: ./output_dir +[04:08:40.086697] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.3132 (0.3132) closs: 0.6514 (0.6514) time: 3.2054 data: 1.2846 max mem: 27896 +[04:08:58.944133] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.3039 (0.3061) closs: 0.9142 (0.8930) time: 2.0056 data: 0.1169 max mem: 27896 +[04:09:17.782690] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.3034 (0.3083) closs: 0.8751 (0.8666) time: 1.8847 data: 0.0001 max mem: 27896 +[04:09:36.650555] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.3027 (0.3036) closs: 0.8511 (0.8649) time: 1.8852 data: 0.0001 max mem: 27896 +[04:09:55.557013] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.3062 (0.3063) closs: 0.8427 (0.8541) time: 1.8886 data: 0.0002 max mem: 27896 +[04:10:14.538779] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.3062 (0.3108) closs: 0.8332 (0.8547) time: 1.8943 data: 0.0002 max mem: 27896 +[04:10:33.501826] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.3091 (0.3128) closs: 0.8533 (0.8592) time: 1.8972 data: 0.0002 max mem: 27896 +[04:10:52.456478] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.3068 (0.3104) closs: 0.8281 (0.8562) time: 1.8958 data: 0.0002 max mem: 27896 +[04:11:11.449969] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.3006 (0.3107) closs: 0.8402 (0.8559) time: 1.8973 data: 0.0001 max mem: 27896 +[04:11:30.433114] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.3006 (0.3130) closs: 0.8645 (0.8548) time: 1.8988 data: 0.0002 max mem: 27896 +[04:11:49.390336] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.3083 (0.3140) closs: 0.8268 (0.8523) time: 1.8969 data: 0.0002 max mem: 27896 +[04:12:08.297454] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.3087 (0.3158) closs: 0.7961 (0.8464) time: 1.8931 data: 0.0002 max mem: 27896 +[04:12:27.263638] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.3062 (0.3156) closs: 0.7961 (0.8509) time: 1.8936 data: 0.0001 max mem: 27896 +[04:12:46.235939] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.2982 (0.3144) closs: 0.8222 (0.8480) time: 1.8969 data: 0.0001 max mem: 27896 +[04:13:05.196913] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.2880 (0.3124) closs: 0.8459 (0.8516) time: 1.8966 data: 0.0002 max mem: 27896 +[04:13:24.152162] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.2880 (0.3148) closs: 0.8459 (0.8508) time: 1.8957 data: 0.0002 max mem: 27896 +[04:13:43.111343] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.3095 (0.3133) closs: 0.8125 (0.8509) time: 1.8957 data: 0.0002 max mem: 27896 +[04:14:02.059325] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.3053 (0.3137) closs: 0.8366 (0.8496) time: 1.8953 data: 0.0002 max mem: 27896 +[04:14:20.984571] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.3173 (0.3141) closs: 0.8201 (0.8481) time: 1.8936 data: 0.0001 max mem: 27896 +[04:14:39.914781] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.3322 (0.3155) closs: 0.8201 (0.8478) time: 1.8927 data: 0.0002 max mem: 27896 +[04:14:58.900447] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.2994 (0.3135) closs: 0.8613 (0.8494) time: 1.8957 data: 0.0002 max mem: 27896 +[04:15:17.859832] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.2909 (0.3131) closs: 0.8319 (0.8483) 
time: 1.8972 data: 0.0002 max mem: 27896 +[04:15:36.794509] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.2960 (0.3124) closs: 0.8319 (0.8499) time: 1.8946 data: 0.0001 max mem: 27896 +[04:15:55.658566] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.2969 (0.3122) closs: 0.8500 (0.8504) time: 1.8899 data: 0.0001 max mem: 27896 +[04:16:14.645590] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.3129 (0.3137) closs: 0.8391 (0.8506) time: 1.8925 data: 0.0002 max mem: 27896 +[04:16:33.513571] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.3129 (0.3137) closs: 0.8141 (0.8500) time: 1.8927 data: 0.0002 max mem: 27896 +[04:16:52.475377] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.2952 (0.3132) closs: 0.7776 (0.8484) time: 1.8914 data: 0.0002 max mem: 27896 +[04:17:11.412915] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.2944 (0.3123) closs: 0.7811 (0.8488) time: 1.8949 data: 0.0002 max mem: 27896 +[04:17:30.330446] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.2960 (0.3125) closs: 0.8017 (0.8501) time: 1.8927 data: 0.0002 max mem: 27896 +[04:17:49.226468] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.3128 (0.3135) closs: 0.8010 (0.8492) time: 1.8906 data: 0.0001 max mem: 27896 +[04:18:08.157379] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.3259 (0.3140) closs: 0.7914 (0.8465) time: 1.8913 data: 0.0002 max mem: 27896 +[04:18:27.089429] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.3107 (0.3136) closs: 0.8240 (0.8483) time: 1.8931 data: 0.0002 max mem: 27896 +[04:18:46.029402] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.2942 (0.3132) closs: 0.8801 (0.8490) time: 1.8935 data: 0.0002 max mem: 27896 +[04:19:04.985159] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.2938 (0.3127) closs: 0.8596 (0.8487) time: 1.8947 data: 0.0002 max mem: 27896 +[04:19:23.926985] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.2990 (0.3139) closs: 0.8847 (0.8493) time: 1.8948 data: 0.0001 max mem: 27896 +[04:19:42.889202] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.2984 (0.3136) closs: 0.8847 (0.8498) time: 1.8951 data: 0.0002 max mem: 27896 +[04:20:01.819354] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.2945 (0.3141) closs: 0.8506 (0.8503) time: 1.8946 data: 0.0002 max mem: 27896 +[04:20:20.681250] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.3086 (0.3139) closs: 0.8640 (0.8506) time: 1.8895 data: 0.0002 max mem: 27896 +[04:20:39.561245] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.3031 (0.3136) closs: 0.8254 (0.8496) time: 1.8870 data: 0.0002 max mem: 27896 +[04:20:58.511789] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.3048 (0.3138) closs: 0.7809 (0.8486) time: 1.8915 data: 0.0002 max mem: 27896 +[04:21:17.407643] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.3048 (0.3140) closs: 0.8496 (0.8490) time: 1.8923 data: 0.0002 max mem: 27896 +[04:21:36.346825] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.3205 (0.3144) closs: 0.8241 (0.8476) time: 1.8917 data: 0.0002 max mem: 27896 +[04:21:55.284941] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.3215 (0.3144) closs: 0.7919 (0.8469) time: 1.8938 data: 0.0002 max mem: 27896 +[04:22:14.183727] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.2908 (0.3151) closs: 0.7896 (0.8455) time: 1.8918 data: 0.0002 max mem: 27896 +[04:22:33.033058] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.2986 (0.3149) closs: 0.7896 (0.8446) time: 1.8873 data: 0.0002 max mem: 27896 +[04:22:52.033141] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.3073 (0.3153) closs: 0.8106 (0.8442) time: 1.8924 data: 0.0002 max mem: 27896 +[04:23:10.877552] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.3192 (0.3159) closs: 0.8106 (0.8436) time: 1.8922 data: 0.0002 max mem: 27896 +[04:23:29.818614] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.3165 (0.3162) closs: 0.8062 (0.8433) time: 1.8892 data: 0.0002 max mem: 27896 +[04:23:48.729837] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.3106 (0.3164) closs: 0.7980 (0.8427) time: 1.8925 data: 0.0002 max mem: 27896 +[04:24:07.666036] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.2986 (0.3160) closs: 0.7908 (0.8428) time: 1.8923 data: 0.0002 max mem: 27896 +[04:24:26.621467] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.2896 (0.3159) closs: 0.8088 (0.8424) time: 1.8945 data: 0.0002 max mem: 27896 +[04:24:45.599584] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.2821 (0.3161) closs: 0.8423 (0.8438) time: 1.8966 data: 0.0002 max mem: 27896 +[04:25:04.476389] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.3012 (0.3183) closs: 0.9362 (0.8444) time: 1.8927 data: 0.0002 max mem: 27896 +[04:25:23.459088] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.3306 (0.3186) closs: 0.7977 (0.8438) time: 1.8929 data: 0.0002 max mem: 27896 +[04:25:42.384155] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.3257 (0.3189) closs: 0.7825 (0.8435) time: 1.8953 data: 0.0002 max mem: 27896 +[04:26:01.358224] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.3128 (0.3201) closs: 0.8171 (0.8426) time: 1.8949 data: 0.0002 max mem: 27896 +[04:26:20.291928] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.3103 (0.3200) closs: 0.8064 (0.8420) time: 1.8953 data: 0.0002 max mem: 27896 +[04:26:39.237442] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.3198 (0.3202) closs: 0.8320 (0.8427) time: 1.8939 data: 0.0002 max mem: 27896 +[04:26:58.136155] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.3220 (0.3202) closs: 0.8468 (0.8431) time: 1.8921 data: 0.0002 max mem: 27896 +[04:27:17.086286] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.3003 (0.3201) closs: 0.8231 (0.8426) time: 1.8924 data: 0.0002 max mem: 27896 +[04:27:36.048124] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.3098 (0.3202) closs: 0.7716 (0.8424) time: 1.8955 data: 0.0002 max mem: 27896 +[04:27:54.982696] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.3098 (0.3201) closs: 0.8542 (0.8431) time: 1.8948 data: 0.0002 max mem: 27896 +[04:28:13.909296] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.3018 (0.3205) closs: 0.8435 (0.8433) time: 1.8930 data: 0.0002 max mem: 27896 +[04:28:32.846395] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.3174 (0.3209) closs: 0.8081 (0.8426) time: 1.8931 data: 0.0002 max mem: 27896 +[04:28:51.781440] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.3213 (0.3209) closs: 0.8105 (0.8424) time: 1.8935 data: 0.0002 max mem: 27896 +[04:29:10.626535] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.3271 (0.3213) closs: 0.8478 (0.8436) time: 1.8889 data: 0.0002 max mem: 27896 +[04:29:29.652082] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.3271 (0.3219) closs: 0.8954 (0.8442) time: 1.8935 data: 0.0003 max mem: 27896 +[04:29:48.482813] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.3248 (0.3219) closs: 0.8301 (0.8436) time: 1.8927 data: 0.0003 max mem: 27896 +[04:30:07.417393] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.3113 (0.3224) closs: 0.8336 (0.8442) time: 1.8882 data: 0.0002 max mem: 27896 +[04:30:26.314560] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.2994 (0.3220) closs: 0.8543 (0.8439) time: 1.8915 data: 0.0002 max mem: 27896 +[04:30:45.279799] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.2994 (0.3222) closs: 0.8287 (0.8441) time: 1.8931 data: 0.0002 max mem: 27896 
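The timing fields allow a quick consistency check. Steps run at a steady ~1.89 s with negligible data-loading time, and with batch_size=8 on 8 GPUs and accum_iter=1, each of the 812 steps consumes 64 samples, i.e. one pass over the ~52K-example Alpaca set per epoch. Back-of-envelope, with all inputs copied from the log above:

# Rough consistency check for this 13B run; inputs taken from the log.
gpus, per_gpu_batch, accum = 8, 8, 1      # world_size, batch_size, accum_iter
steps, sec_per_step = 812, 1.893          # steps/epoch and a typical "time:" value

print(gpus * per_gpu_batch * accum * steps)  # 51968 samples/epoch (~52K Alpaca examples)
print(steps * sec_per_step / 60)             # ~25.6 min, cf. "Epoch: [...] Total time: 0:25:38"
print(4 * steps * sec_per_step / 3600)       # ~1.71 h, cf. "Training time 1:42:44"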
+[04:31:04.223755] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.3136 (0.3222) closs: 0.8644 (0.8445) time: 1.8954 data: 0.0002 max mem: 27896 +[04:31:23.141454] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.3024 (0.3221) closs: 0.8557 (0.8448) time: 1.8930 data: 0.0002 max mem: 27896 +[04:31:42.071129] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.3024 (0.3218) closs: 0.8299 (0.8441) time: 1.8923 data: 0.0002 max mem: 27896 +[04:32:01.036945] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.3053 (0.3215) closs: 0.8202 (0.8445) time: 1.8947 data: 0.0002 max mem: 27896 +[04:32:19.981071] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.3053 (0.3215) closs: 0.8202 (0.8441) time: 1.8954 data: 0.0002 max mem: 27896 +[04:32:38.888418] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.2892 (0.3214) closs: 0.8244 (0.8446) time: 1.8925 data: 0.0002 max mem: 27896 +[04:32:57.804891] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.2988 (0.3216) closs: 0.8321 (0.8446) time: 1.8911 data: 0.0002 max mem: 27896 +[04:33:16.740653] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.2995 (0.3220) closs: 0.8274 (0.8445) time: 1.8925 data: 0.0001 max mem: 27896 +[04:33:35.601422] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.3101 (0.3220) closs: 0.8274 (0.8445) time: 1.8898 data: 0.0001 max mem: 27896 +[04:33:54.572764] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.3027 (0.3219) closs: 0.8614 (0.8449) time: 1.8915 data: 0.0001 max mem: 27896 +[04:34:13.455842] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.3061 (0.3223) closs: 0.9174 (0.8456) time: 1.8927 data: 0.0001 max mem: 27896 +[04:34:15.578241] Epoch: [3] Total time: 0:25:38 +[04:34:15.579225] Averaged stats: lr: 0.000005 grad_norm: 0.3140 (0.3224) closs: 0.9157 (0.8461) +[04:34:15.661264] model saved +[04:34:16.446718] optimizer saved +[04:34:16.447181] other rank-common saved +[04:34:16.449046] rank-specific saved +[04:34:16.449254] Training time 1:42:44 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f3ba8a74edeb56d30f33b93229d72ff0893234d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57840f6a94b317e5216383955e671d5c485aa459e9a77c46399004b14aab3449 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..e0aa9f434f0cc0a821265ec66357c81590a16153 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e48515afe7b737d914e6a945fddc635b1f3bd53d716da73568279b19d4e9fc +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c3e73a9d547910fb183caa16be9b2f9ed5a7866 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:15ad86707b40e58d9f1e5b304e7f138c74be797e45cdcd6c9c3e67d1ddea2a8b +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf8b0b7d0020d9c161451e9628f04abe5fbbfd7d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c28c92604c98733f7d11de93aabb7b91bf51cf6d5d1b4a7648f88735df9be8 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ae586e8704515a765d6489bf6c23b882d96dd76 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:726e9afa5e22acbf9e238aaa616537ff8bc766c7a407d50f49fd184a85596d3f +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..db1a4f53cdd71b119a332dfc0460aacb6dcba83d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ffbc6452c328bba47a43a12dbd4bc293c231de8f57a6bd819aa611ed703d60 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b6b9e7e744aa00e7901cd325ae5b2ca70edd02f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c909bd4009747dcecd359334b72083ce7d70ba611d0835b3f9e805633df345 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..8209a0373966755123d2c2a67c9f7f3716dad120 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af183f8bb45d2e6aecf19b37dd23b380900961ab1df49a422c47a88086dd99b +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..654e95a72619e275667b7e2858b830b4eb336d60 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1407e17b55be720204de47db0e9d89c18253e4bd99ce9beecf96812ad9220b +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 
0000000000000000000000000000000000000000..fdb8a6396561e7587d5763107287361c3b7d2a65 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ca963a2250f78056df0749f6d91daf572f7e87a00398a2ea04fb8e0d4fb2981 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..28bd47df51e7ce248d00ef3d9ee2ca5cd4821c25 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f6388325fba6658d8f5f383cb6affc164930411672f0944d7f249e10a03b78 +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5680517ef1395e38ea1a66b086e2213b58700f0 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c45bf04236026e6ce60807e96c85d649fb32f7065883be1052037d816478dc +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2ec26280e83c14eb2d09d4d2d0ccd17ebb3ede7 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49923076923077e-05, "train_closs": 0.7980487976441016, "train_grad_norm": 0.5980251384056532, "epoch": 0, "val_lr": 2.49923076923077e-05, "val_closs": 0.7980487976441016, "val_grad_norm": 0.5980251384056532} +{"train_lr": 4.6109080828728024e-05, "train_closs": 0.7623572307492678, "train_grad_norm": 0.45453824085914174, "epoch": 1, "val_lr": 4.6109080828728024e-05, "val_closs": 0.7623572307492678, "val_grad_norm": 0.45453824085914174} +{"train_lr": 2.750346153846151e-05, "train_closs": 0.750338752788993, "train_grad_norm": 0.46191218195511746, "epoch": 2, "val_lr": 2.750346153846151e-05, "val_closs": 
0.750338752788993, "val_grad_norm": 0.46191218195511746} +{"train_lr": 8.894380709733404e-06, "train_closs": 0.742047518081963, "train_grad_norm": 0.47685301401064945, "epoch": 3, "val_lr": 8.894380709733404e-06, "val_closs": 0.742047518081963, "val_grad_norm": 0.47685301401064945} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7237f8e9f2027c7ccb41d96c4cfe451ad95ec056 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log @@ -0,0 +1,7130 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +[16:45:57.921447] > initializing model parallel with size 1 +[16:45:57.921566] > initializing ddp with size 4 +[16:45:57.921573] > initializing pipeline with size 1 +[16:45:58.007928] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[16:45:58.008031] Namespace(batch_size=4, +accum_iter=2, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-70b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', +log_dir='./output_dir', +save_interval=1, +device='cuda', +seed=0, +resume='', +num_workers=4, +pin_mem=True, +world_size=4, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[16:45:58.019987] Model Args: + ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[16:54:32.881369] Model is Peft: True +[16:54:32.888802] Trainable parameter count : 8036352 (local rank), 8036352 (all). 
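Editor's note: the trainable-parameter count reported just above is consistent with normBias tuning, i.e. only the per-layer fp32 biases and RMSNorm weights enter the count; the bf16 weight matrices enumerated below sum to far more than 8,036,352, so they are presumably listed but frozen. A minimal sanity-check sketch of the arithmetic, using the tensor shapes printed in this log; the trailing +8192 is assumed to be the final model norm (llma.norm.weight), which falls outside this excerpt:

# Verify the reported count of 8,036,352 trainable parameters for
# bias + norm tuning on Llama-2-70B, using shapes from the log above.
dim, kv_dim, ffn_dim, n_layers = 8192, 1024, 28672, 80
per_layer = (
    dim        # attention.wq.bias
    + kv_dim   # attention.wk.bias
    + kv_dim   # attention.wv.bias
    + dim      # attention.wo.bias
    + ffn_dim  # feed_forward.w1.bias
    + dim      # feed_forward.w2.bias
    + ffn_dim  # feed_forward.w3.bias
    + dim      # attention_norm.weight
    + dim      # ffn_norm.weight
)
total = n_layers * per_layer + dim  # + final llma.norm.weight (assumed)
assert per_layer == 100_352
assert total == 8_036_352  # matches "Trainable parameter count" above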
+[16:54:32.917071] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917111] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917124] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917137] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917147] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917159] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917169] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917180] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917190] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917204] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917214] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917226] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917236] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917247] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917257] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917269] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917279] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917293] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917304] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917315] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917325] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917336] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917346] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917357] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917367] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917379] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917389] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917400] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917410] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917421] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917431] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917442] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917452] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917466] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917475] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917487] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917497] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917508] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917522] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917533] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917543] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917555] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917565] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917576] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917585] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917596] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917606] Trainable param: llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917617] Trainable param: 
llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917627] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917641] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917650] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917662] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917671] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917682] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917692] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917703] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917713] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917725] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917735] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917746] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917755] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917767] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917777] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917787] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917799] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917812] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917821] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917832] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917842] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917853] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917863] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 
+[16:54:32.917874] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917884] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917896] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917905] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917917] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917928] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917939] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917949] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917959] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917970] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917984] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917993] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918004] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918014] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918025] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918035] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918046] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918055] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918067] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918077] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918088] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918098] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918109] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918119] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: 
torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918130] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918142] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918155] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918165] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918176] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918186] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918197] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918207] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918217] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918227] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918239] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918249] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918260] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918270] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918281] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918291] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918302] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918313] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918326] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918335] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918346] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918356] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918367] Trainable param: llma.layers.7.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918377] Trainable param: 
llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918388] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918398] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918410] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918420] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918431] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918440] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918451] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918461] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918472] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918482] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918496] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918505] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918516] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918526] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918537] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918547] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918558] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918567] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918579] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918589] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918600] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918610] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918621] Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: 
torch.bfloat16 +[16:54:32.918631] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918642] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918653] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918666] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918676] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918687] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918696] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918708] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918717] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918728] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918738] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918750] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918759] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918771] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918780] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918792] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918801] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918812] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918823] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918836] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918846] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918857] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918867] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918878] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 
8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918887] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918898] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918908] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918920] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918930] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918941] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918951] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918962] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918971] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918982] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918993] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919006] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919016] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919026] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919036] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919047] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919057] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919068] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919078] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919090] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919100] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919111] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919121] Trainable param: llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919132] Trainable 
param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919142] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919152] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919163] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919176] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919186] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919197] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919207] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919218] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919228] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919239] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919248] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919260] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919270] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919281] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919291] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919301] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919311] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919322] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919333] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919346] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919355] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919367] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919376] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: 
True, dtype: torch.float32 +[16:54:32.919387] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919397] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919408] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919418] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919430] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919440] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919451] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919460] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919472] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919481] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919492] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919503] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919516] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919526] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919537] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919547] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919558] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919567] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919578] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919588] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919600] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919610] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919621] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919631] Trainable param: 
llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919642] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919652] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919662] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919673] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919686] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919696] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919707] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919717] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919728] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919738] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919749] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919758] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919770] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919780] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919791] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919801] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919812] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919822] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919832] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919843] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919856] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919866] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919877] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[16:54:32.919887] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919898] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919907] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919918] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919928] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919940] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919950] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919961] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919970] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919981] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919991] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920003] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920013] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920027] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920036] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.920047] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920057] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920068] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920078] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920089] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920098] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920110] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920120] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920131] Trainable param: 
llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920141] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920152] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920162] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920172] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920183] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920196] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920206] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.920216] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920226] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920237] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920247] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920258] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920267] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920279] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920289] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920300] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920310] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920321] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920330] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920341] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920352] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920365] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920379] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32
+[16:54:32.920390] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920400] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920411] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920420] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920432] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920441] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920453] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920463] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920474] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920484] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920495] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920505] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920516] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920526] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920539] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920549] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[16:54:32.920560] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920570] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920581] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920590] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920601] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920611] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920623] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920632] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920644] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920653] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920664] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920674] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920685] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920696] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[... 16:54:32.920709 .. 16:54:32.925470: entries for llma.layers.21 through llma.layers.49 continue in the same 16-entries-per-layer pattern: the attention wq/wk/wv/wo and feed_forward w1/w2/w3 weights in torch.bfloat16 with model_parallel: True; all biases and the attention_norm/ffn_norm weights in torch.float32, with the wq/wk/wv/w1/w3 biases model_parallel: True and the wo/w2 biases and both norm weights model_parallel: False ...]
+[16:54:32.925479] Trainable param: 
llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925491] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925500] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925510] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925523] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925533] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925542] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925552] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925561] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925572] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925581] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925591] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925599] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925609] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925618] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925628] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925638] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925650] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925658] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925669] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925677] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925687] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925696] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925706] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[16:54:32.925715] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925726] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925734] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925744] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925753] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925763] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925772] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925782] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925792] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925805] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925814] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925825] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925833] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925843] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925852] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925862] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925871] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925882] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925891] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925901] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925909] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925919] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925928] Trainable param: llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925938] Trainable param: 
llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925947] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925959] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925968] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925978] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925987] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925997] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926006] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926016] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926025] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926036] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926044] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926054] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926064] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926074] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926083] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926093] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926103] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926114] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926123] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926134] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926142] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926152] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926161] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[16:54:32.926171] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926180] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926191] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926200] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926210] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926219] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926229] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926238] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926248] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926258] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926270] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926279] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926289] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926298] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926308] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926317] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926328] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926338] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926349] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926358] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926368] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926377] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926387] Trainable param: llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926396] Trainable param: 
llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926406] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926417] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926430] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926438] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926448] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926457] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926467] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926476] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926486] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926494] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926505] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926514] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926524] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926533] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926543] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926552] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926561] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926571] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926583] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926592] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926602] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926611] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926621] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[16:54:32.926630] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926640] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926649] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926660] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926669] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926679] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926687] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926697] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926706] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926716] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926726] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926738] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926746] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926756] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926765] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926775] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926783] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926793] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926802] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926813] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926822] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926832] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926841] Trainable param: llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926852] Trainable param: 
llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926861] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926871] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926880] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926892] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926901] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926911] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926920] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926930] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926939] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926949] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926958] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926969] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926978] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926988] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926997] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927007] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927016] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927025] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927035] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927048] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927056] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927066] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927075] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[16:54:32.927085] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927094] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927104] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927114] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927125] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927133] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927143] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927152] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927162] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927171] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927181] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927190] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927202] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927211] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927221] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927229] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927240] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927249] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927259] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927268] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927279] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927288] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927298] Trainable param: llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927306] Trainable param: 
llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927317] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927325] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927335] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927345] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927357] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927365] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927376] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927385] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927396] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927405] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927415] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927424] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927435] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927444] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927454] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927463] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927473] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927482] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927492] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927501] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927513] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927522] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927532] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[16:54:32.927541] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927551] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927560] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927571] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927580] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927592] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927601] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927611] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927620] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927630] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927639] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927649] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927659] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927671] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927680] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927690] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927699] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927709] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927717] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927727] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927736] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927747] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927756] Trainable param: llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927766] Trainable param: 
llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927775] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927785] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927793] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927803] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927813] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927825] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927834] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927845] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927853] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927864] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927873] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927883] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927891] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927903] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927912] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927922] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927931] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927941] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927950] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927960] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927969] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927982] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927991] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[16:54:32.928001] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928010] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928020] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928028] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928039] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928047] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928058] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928067] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928077] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928086] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928096] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928105] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928115] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928124] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928136] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928145] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928155] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928165] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928175] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928184] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928194] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928203] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928214] Trainable param: llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928223] Trainable param: 
llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928233] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928242] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928252] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928260] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928270] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928280] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928292] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928301] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928312] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928320] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928331] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928340] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928349] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928358] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928369] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928378] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928388] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928397] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928407] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928415] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928426] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928436] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928448] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928457] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928467] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928476] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928486] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928495] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928505] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928514] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928525] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928534] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928543] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928552] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928562] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928571] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928581] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928591] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928603] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928612] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928622] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928631] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928641] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928650] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928660] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928669] Trainable param: llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928680] Trainable param: 
llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928690] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928700] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928709] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928719] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928728] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928738] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928747] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928759] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928768] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928778] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928787] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928797] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928806] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928816] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928825] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928835] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928844] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928854] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928863] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928873] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928882] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928892] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928902] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[16:54:32.928913] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928922] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928932] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928941] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928953] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928962] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928972] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928981] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928992] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929001] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929012] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929020] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929031] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929040] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929050] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929060] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929072] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929081] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929091] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929100] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929110] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929119] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929130] Trainable param: llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929139] Trainable param: 
llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929150] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929159] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929169] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929178] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929188] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929196] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929206] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929216] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929228] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929237] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929247] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929256] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929266] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929275] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929285] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929293] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929304] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929313] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929323] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929332] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929342] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929351] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929361] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[16:54:32.929371] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929383] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929392] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929402] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929411] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929422] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929430] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929440] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929449] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929460] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929469] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929480] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929489] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929499] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929508] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929523] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929533] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929545] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929554] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929565] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929573] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929584] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929592] Trainable param: llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929603] Trainable param: 
llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929611] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929623] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929632] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929642] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929670] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929680] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929689] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929699] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929709] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929721] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929730] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929740] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929749] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929760] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929770] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929780] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929788] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929799] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929808] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929818] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929827] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929837] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929846] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[16:54:32.929856] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929866] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929878] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929887] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929897] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929906] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929916] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929925] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929934] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929943] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929954] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929962] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929972] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929981] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929991] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930000] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930010] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930019] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930032] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930041] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.930051] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930060] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.930070] Trainable param: llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930079] Trainable param: 
llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.930089] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930098] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930109] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930118] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930128] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930136] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930146] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930155] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930165] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930175] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930186] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930199] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930222] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[16:54:32.930227] Quantizing model to 4bit! 
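The dtype pattern in the listing above is the norm/bias PEFT scheme that the run's output_dir name (alpaca_llamaPeft_normBias_QF_512_70B) and bias_tuning=True refer to: the large weight matrices stay in bfloat16 (most of them tensor-parallel sharded), while the bias vectors and RMSNorm weights are kept in float32, and only the latter account for the trainable-parameter count reported further below. A minimal sketch of that selection and of the per-parameter report format, assuming a plain nn.Module; the function names are illustrative rather than the repository's actual code, and is_model_parallel is an assumed attribute standing in for however the tensor-parallel layers tag their sharded parameters:

    import torch

    def mark_norm_bias_trainable(model: torch.nn.Module) -> None:
        # Sketch of a typical norm/bias PEFT selection (not the repo's exact
        # code): freeze everything, then re-enable gradients only for bias
        # vectors and norm weights.
        for name, param in model.named_parameters():
            param.requires_grad = name.endswith(".bias") or "norm" in name

    def report_params(model: torch.nn.Module) -> None:
        # Reproduce the per-parameter lines seen in the log above.
        # `is_model_parallel` is an assumed attribute name.
        for name, param in model.named_parameters():
            mp = getattr(param, "is_model_parallel", False)
            print(f"Trainable param: {name}, local_size: {param.size()}, "
                  f"model_parallel: {mp}, dtype: {param.dtype}")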
+Traceback (most recent call last):
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 318, in <module>
+    main(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 172, in main
+    load_tensor_parallel_model(model, args.pretrained_path, args.pretrained_type)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/util/tensor_parallel.py", line 177, in load_tensor_parallel_model
+    local_state_dict = _load_checkpoint_and_merge_ranks(
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/util/tensor_parallel.py", line 71, in _load_checkpoint_and_merge_ranks
+    shard = torch.load(ckpt_files[shard_id], map_location="cpu")
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 809, in load
+    return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1172, in _load
+    result = unpickler.load()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1142, in persistent_load
+    typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1112, in load_tensor
+    storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)._typed_storage()._untyped_storage
+RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 58720256 bytes. Error code 12 (Cannot allocate memory)
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 106890) of binary: /data/anaconda3/envs/accessory/bin/python3.10
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+main_finetune.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 106891)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[2]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 2 (local_rank: 2)
+  exitcode  : 1 (pid: 106892)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[3]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 3 (local_rank: 3)
+  exitcode  : 1 (pid: 106893)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 106890)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
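The failure is host-side rather than GPU-side: all four workers deserialized the same consolidated checkpoint shard into CPU RAM at once inside _load_checkpoint_and_merge_ranks, and the allocator finally refused a 58,720,256-byte request (56 MiB, consistent with one 28672 x 1024 slice of a feed-forward weight in a 16-bit format) once system memory ran out. The relaunch below drops to two processes, halving the simultaneous load; another common mitigation is to let local ranks take turns deserializing, sketched here under the assumption that torchrun's LOCAL_RANK/LOCAL_WORLD_SIZE variables are set and the default process group is already initialized (on PyTorch 2.1 or newer, torch.load(..., mmap=True) can avoid materializing the whole file instead):

    import os
    import torch
    import torch.distributed as dist

    def load_cpu_staggered(path: str) -> dict:
        # Staggered checkpoint loading: local ranks deserialize one at a
        # time, so the transient allocation peak is one copy per node
        # rather than one per process.
        local_rank = int(os.environ["LOCAL_RANK"])
        local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
        state_dict = None
        for turn in range(local_world_size):
            if turn == local_rank:
                state_dict = torch.load(path, map_location="cpu")
            dist.barrier()  # wait until the current rank finishes loading
        # Note: each rank still holds its full copy afterwards; in practice
        # a rank would keep only the tensors it needs before the next turn.
        return state_dict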
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 0): env://, gpu 0
+[17:01:32.060306] > initializing model parallel with size 1
+[17:01:32.060361] > initializing ddp with size 2
+[17:01:32.060366] > initializing pipeline with size 1
+[17:01:32.134298] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[17:01:32.134363] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=4,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[17:01:32.145104] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 0): env://, gpu 0
+| distributed init (rank 1): env://, gpu 1
+[17:01:52.288410] > initializing model parallel with size 1
+[17:01:52.288524] > initializing ddp with size 2
+[17:01:52.288530] > initializing pipeline with size 1
+[17:01:52.344237] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[17:01:52.344328] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=8,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[17:01:52.354679] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+[17:09:51.764818] Model is Peft: True
+[17:09:51.772293] Trainable parameter count : 8036352 (local rank), 8036352 (all).
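The ModelArgs dump explains the shapes that repeat through the parameter listing, and the trainable-parameter count just printed is exactly the biases plus norm weights of such a model. The arithmetic below is illustrative and assumes the reference LLaMA sizing rule for the SwiGLU feed-forward width (two thirds of 4*dim, scaled by ffn_dim_multiplier, rounded up to a multiple of multiple_of):

    # Shape and count arithmetic implied by the ModelArgs above (illustrative).
    dim, n_layers, n_heads, n_kv_heads = 8192, 80, 64, 8
    multiple_of, ffn_dim_multiplier = 4096, 1.3

    head_dim = dim // n_heads        # 128
    kv_dim = n_kv_heads * head_dim   # 1024 -> the wk/wv local_size [1024, 8192]

    # Assumed reference LLaMA feed-forward sizing.
    hidden = int(2 * (4 * dim) / 3)                                     # 21845
    hidden = int(ffn_dim_multiplier * hidden)                           # 28398
    hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)  # 28672

    # Per layer: seven biases (wq, wk, wv, wo, w1, w2, w3) plus two RMSNorm
    # weights; plus the final llma.norm. This reproduces the reported count.
    per_layer = (dim + kv_dim + kv_dim + dim + hidden + dim + hidden) + 2 * dim
    assert n_layers * per_layer + dim == 8_036_352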
+[17:09:51.796307] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796346] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796359] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796372] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796383] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796395] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796405] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796416] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796427] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796440] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796450] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796461] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796471] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796482] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796492] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796503] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796514] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796528] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796539] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796550] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796560] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796572] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796581] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796592] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796602] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796614] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796624] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796635] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796645] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796656] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796666] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796677] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796688] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796701] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796711] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796722] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796732] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796743] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796753] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796764] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796774] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796785] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796795] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796806] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796816] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796827] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796837] Trainable param: llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796848] Trainable param: 
llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796858] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796871] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796881] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796892] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796902] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796913] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796923] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796934] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796944] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796956] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796966] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796977] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796986] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796997] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797008] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797019] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797029] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797042] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797052] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797064] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797073] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797085] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797094] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 
+[17:09:51.797105] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797115] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797127] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797137] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797148] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797159] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797170] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797180] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797191] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797202] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797215] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797224] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797235] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797245] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797256] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797266] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797277] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797287] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797299] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797308] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797320] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797329] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797340] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797350] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: 
torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797361] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797373] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797386] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797396] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797408] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797417] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797428] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797438] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797449] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797459] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797471] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797481] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797492] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797502] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797517] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797527] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797538] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797551] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797564] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797575] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797586] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797596] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797607] Trainable param: llma.layers.7.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797617] Trainable param: 
llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797629] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797639] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797651] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797661] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797672] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797682] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797694] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797703] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797714] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797725] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797738] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797748] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.797759] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797769] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797780] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797790] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797801] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797810] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797822] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797832] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797843] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797853] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797864] Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797874] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797885] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797895] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797908] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797918] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.797929] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797939] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797950] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797960] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797971] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797981] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797993] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798002] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798013] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798023] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798034] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798044] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798055] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798066] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798079] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798089] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798100] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798109] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798120] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798130] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798141] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798151] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798163] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798173] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798184] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798193] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798204] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798214] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798225] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798235] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798248] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798258] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798269] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798278] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798290] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798299] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798310] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798320] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798331] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798341] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798352] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798362] Trainable param: llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798373] Trainable param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798383] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798393] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798404] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798417] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798427] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798438] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798448] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798459] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798469] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798480] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798490] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798502] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798511] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798522] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798532] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798543] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798553] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798564] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798574] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798587] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798597] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798608] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798617] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798629] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798638] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798649] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798659] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798671] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798681] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798692] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798701] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798712] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798722] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798733] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798743] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798756] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798766] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798777] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798787] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798798] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798808] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798819] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798829] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798841] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798851] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798862] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798871] Trainable param: llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798882] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798892] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798903] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798914] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798926] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798936] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798948] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798957] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798968] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798978] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798989] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798999] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799011] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799020] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799031] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799041] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799052] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799062] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799073] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799083] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799096] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799106] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799117] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799127] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799138] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799148] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799158] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799168] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799180] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799190] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799201] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799211] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799222] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799231] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799242] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799253] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799266] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799276] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799287] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799296] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799308] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799318] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799329] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799338] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799350] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799360] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799371] Trainable param: llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799380] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799391] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799401] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799412] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799422] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799436] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799446] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799457] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799466] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799478] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799487] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799498] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799508] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799520] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799530] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799541] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799550] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799561] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799571] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799582] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799593] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799605] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799619] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799631] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799640] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799652] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799661] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799672] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799682] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799694] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799703] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799715] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799724] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799735] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799745] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799756] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799766] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799779] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799789] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799800] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799810] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799821] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799831] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799842] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799851] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799864] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799873] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799884] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799894] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799905] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799914] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799925] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799936] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799949] Trainable param: llma.layers.21.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799959] Trainable param: llma.layers.21.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799970] Trainable param: llma.layers.21.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799979] Trainable param: llma.layers.21.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799990] Trainable param: llma.layers.21.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800000] Trainable param: llma.layers.21.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800011] Trainable param: llma.layers.21.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800021] Trainable param: llma.layers.21.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800033] Trainable param: llma.layers.21.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800043] Trainable param: llma.layers.21.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800054] Trainable param: llma.layers.21.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800063] Trainable param: llma.layers.21.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800074] Trainable param: llma.layers.21.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800084] Trainable param: llma.layers.21.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800095] Trainable param: llma.layers.21.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800106] Trainable param: llma.layers.21.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800119] Trainable param: llma.layers.22.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800128] Trainable param: llma.layers.22.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800139] Trainable param: llma.layers.22.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800149] Trainable param: llma.layers.22.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800160] Trainable param: llma.layers.22.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800170] Trainable param: llma.layers.22.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800181] Trainable param: llma.layers.22.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800191] Trainable param: llma.layers.22.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800203] Trainable param: llma.layers.22.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800213] Trainable param: llma.layers.22.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800223] Trainable param: llma.layers.22.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800233] Trainable param: llma.layers.22.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800244] Trainable param: llma.layers.22.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800254] Trainable param: llma.layers.22.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800265] Trainable param: llma.layers.22.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800276] Trainable param: llma.layers.22.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800289] Trainable param: llma.layers.23.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800299] Trainable param: llma.layers.23.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800310] Trainable param: llma.layers.23.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800320] Trainable param: llma.layers.23.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800330] Trainable param: llma.layers.23.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800340] Trainable param: llma.layers.23.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800351] Trainable param: llma.layers.23.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800361] Trainable param: llma.layers.23.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800373] Trainable param: llma.layers.23.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800382] Trainable param: llma.layers.23.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800393] Trainable param: llma.layers.23.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800403] Trainable param: llma.layers.23.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800414] Trainable param: llma.layers.23.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800424] Trainable param: llma.layers.23.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800435] Trainable param: llma.layers.23.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800446] Trainable param: llma.layers.23.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800459] Trainable param: llma.layers.24.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800469] Trainable param: llma.layers.24.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800480] Trainable param: llma.layers.24.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800489] Trainable param: llma.layers.24.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800500] Trainable param: llma.layers.24.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800510] Trainable param: llma.layers.24.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800521] Trainable param: llma.layers.24.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800531] Trainable param: llma.layers.24.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800543] Trainable param: llma.layers.24.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800553] Trainable param: llma.layers.24.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800564] Trainable param: llma.layers.24.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800574] Trainable param: llma.layers.24.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800585] Trainable param: llma.layers.24.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800594] Trainable param: llma.layers.24.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800605] Trainable param: llma.layers.24.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800616] Trainable param: llma.layers.24.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800631] Trainable param: llma.layers.25.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800641] Trainable param: llma.layers.25.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800652] Trainable param: llma.layers.25.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800662] Trainable param: llma.layers.25.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800673] Trainable param: llma.layers.25.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800683] Trainable param: llma.layers.25.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800694] Trainable param: llma.layers.25.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800704] Trainable param: llma.layers.25.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800715] Trainable param: llma.layers.25.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800725] Trainable param: llma.layers.25.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800736] Trainable param: llma.layers.25.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800746] Trainable param: llma.layers.25.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800757] Trainable param: llma.layers.25.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800766] Trainable param: llma.layers.25.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800777] Trainable param: llma.layers.25.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800788] Trainable param: llma.layers.25.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800801] Trainable param: llma.layers.26.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800810] Trainable param: llma.layers.26.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800822] Trainable param: llma.layers.26.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800831] Trainable param: llma.layers.26.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800842] Trainable param: llma.layers.26.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800852] Trainable param: llma.layers.26.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800863] Trainable param: llma.layers.26.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800872] Trainable param: llma.layers.26.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800885] Trainable param: llma.layers.26.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800894] Trainable param: llma.layers.26.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800905] Trainable param: llma.layers.26.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800915] Trainable param: llma.layers.26.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800926] Trainable param: llma.layers.26.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800936] Trainable param: llma.layers.26.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800946] Trainable param: llma.layers.26.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800957] Trainable param: llma.layers.26.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800970] Trainable param: llma.layers.27.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800980] Trainable param: llma.layers.27.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800991] Trainable param: llma.layers.27.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801001] Trainable param: llma.layers.27.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801012] Trainable param: llma.layers.27.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801021] Trainable param: llma.layers.27.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801032] Trainable param: llma.layers.27.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801042] Trainable param: llma.layers.27.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801054] Trainable param: llma.layers.27.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801063] Trainable param: llma.layers.27.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801074] Trainable param: llma.layers.27.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801084] Trainable param: llma.layers.27.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801095] Trainable param: llma.layers.27.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801105] Trainable param: llma.layers.27.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801115] Trainable param: llma.layers.27.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801126] Trainable param: llma.layers.27.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801139] Trainable param: llma.layers.28.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801149] Trainable param: llma.layers.28.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801160] Trainable param: llma.layers.28.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801170] Trainable param: llma.layers.28.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801181] Trainable param: llma.layers.28.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801190] Trainable param: llma.layers.28.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801202] Trainable param: llma.layers.28.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801211] Trainable param: llma.layers.28.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801223] Trainable param: llma.layers.28.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801233] Trainable param: llma.layers.28.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801244] Trainable param: llma.layers.28.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801253] Trainable param: llma.layers.28.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801264] Trainable param: llma.layers.28.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801274] Trainable param: llma.layers.28.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801285] Trainable param: llma.layers.28.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801296] Trainable param: llma.layers.28.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801308] Trainable param: llma.layers.29.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801318] Trainable param: llma.layers.29.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801329] Trainable param: llma.layers.29.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801339] Trainable param: llma.layers.29.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801349] Trainable param: llma.layers.29.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801359] Trainable param: llma.layers.29.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801370] Trainable param: llma.layers.29.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801380] Trainable param: llma.layers.29.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801391] Trainable param: llma.layers.29.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801401] Trainable param: llma.layers.29.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801412] Trainable param: llma.layers.29.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801422] Trainable param: llma.layers.29.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801433] Trainable param: llma.layers.29.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801442] Trainable param: llma.layers.29.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801453] Trainable param: llma.layers.29.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801464] Trainable param: llma.layers.29.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801476] Trainable param: llma.layers.30.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801486] Trainable param: llma.layers.30.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801497] Trainable param: llma.layers.30.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801507] Trainable param: llma.layers.30.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801521] Trainable param: llma.layers.30.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801531] Trainable param: llma.layers.30.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801542] Trainable param: llma.layers.30.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801552] Trainable param: llma.layers.30.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801565] Trainable param: llma.layers.30.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801575] Trainable param: llma.layers.30.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801586] Trainable param: llma.layers.30.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801596] Trainable param: llma.layers.30.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801607] Trainable param: llma.layers.30.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801617] Trainable param: llma.layers.30.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801628] Trainable param: llma.layers.30.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801639] Trainable param: llma.layers.30.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801652] Trainable param: llma.layers.31.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801662] Trainable param: llma.layers.31.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801673] Trainable param: llma.layers.31.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801683] Trainable param: llma.layers.31.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801694] Trainable param: llma.layers.31.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801704] Trainable param: llma.layers.31.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801715] Trainable param: llma.layers.31.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801725] Trainable param: llma.layers.31.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801737] Trainable param: llma.layers.31.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801747] Trainable param: llma.layers.31.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801758] Trainable param: llma.layers.31.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801767] Trainable param: llma.layers.31.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801778] Trainable param: llma.layers.31.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801788] Trainable param: llma.layers.31.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801799] Trainable param: llma.layers.31.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801809] Trainable param: llma.layers.31.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801822] Trainable param: llma.layers.32.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801832] Trainable param: llma.layers.32.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801843] Trainable param: llma.layers.32.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801853] Trainable param: llma.layers.32.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801865] Trainable param: llma.layers.32.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801874] Trainable param: llma.layers.32.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801885] Trainable param: llma.layers.32.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801895] Trainable param: llma.layers.32.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801907] Trainable param: llma.layers.32.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801917] Trainable param: llma.layers.32.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801928] Trainable param: llma.layers.32.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801938] Trainable param: llma.layers.32.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801949] Trainable param: llma.layers.32.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801959] Trainable param: llma.layers.32.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801969] Trainable param: llma.layers.32.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801980] Trainable param: llma.layers.32.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801993] Trainable param: llma.layers.33.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802003] Trainable param: llma.layers.33.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802014] Trainable param: llma.layers.33.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802024] Trainable param: llma.layers.33.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802035] Trainable param: llma.layers.33.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802044] Trainable param: llma.layers.33.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802055] Trainable param: llma.layers.33.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802065] Trainable param: llma.layers.33.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802077] Trainable param: llma.layers.33.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802087] Trainable param: llma.layers.33.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802098] Trainable param: llma.layers.33.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802108] Trainable param: llma.layers.33.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802119] Trainable param: llma.layers.33.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802128] Trainable param: llma.layers.33.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802139] Trainable param: llma.layers.33.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802150] Trainable param: llma.layers.33.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802163] Trainable param: llma.layers.34.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802173] Trainable param: llma.layers.34.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802184] Trainable param: llma.layers.34.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802193] Trainable param: llma.layers.34.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802204] Trainable param: llma.layers.34.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802214] Trainable param: llma.layers.34.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802225] Trainable param: llma.layers.34.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802235] Trainable param: llma.layers.34.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802248] Trainable param: llma.layers.34.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802258] Trainable param: llma.layers.34.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802269] Trainable param: llma.layers.34.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802278] Trainable param: llma.layers.34.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802290] Trainable param: llma.layers.34.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802299] Trainable param: llma.layers.34.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802310] Trainable param: llma.layers.34.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802321] Trainable param: llma.layers.34.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802334] Trainable param: llma.layers.35.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802344] Trainable param: llma.layers.35.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802355] Trainable param: llma.layers.35.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802365] Trainable param: llma.layers.35.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802376] Trainable param: llma.layers.35.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802386] Trainable param: llma.layers.35.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802397] Trainable param: llma.layers.35.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802407] Trainable param: llma.layers.35.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802418] Trainable param: llma.layers.35.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802428] Trainable param: llma.layers.35.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802439] Trainable param: llma.layers.35.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802449] Trainable param: llma.layers.35.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802460] Trainable param: llma.layers.35.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802469] Trainable param: llma.layers.35.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802480] Trainable param: llma.layers.35.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802491] Trainable param: llma.layers.35.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802504] Trainable param: llma.layers.36.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802514] Trainable param: llma.layers.36.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802525] Trainable param: llma.layers.36.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802534] Trainable param: llma.layers.36.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802546] Trainable param: llma.layers.36.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802555] Trainable param: llma.layers.36.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802566] Trainable param: llma.layers.36.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802576] Trainable param: llma.layers.36.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802588] Trainable param: llma.layers.36.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802597] Trainable param: llma.layers.36.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802609] Trainable param: llma.layers.36.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802618] Trainable param: llma.layers.36.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802629] Trainable param: llma.layers.36.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802639] Trainable param: llma.layers.36.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802650] Trainable param: llma.layers.36.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802661] Trainable param: llma.layers.36.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802674] Trainable param: llma.layers.37.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802684] Trainable param: llma.layers.37.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802695] Trainable param: llma.layers.37.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802705] Trainable param: llma.layers.37.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802716] Trainable param: llma.layers.37.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802726] Trainable param: llma.layers.37.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802737] Trainable param: llma.layers.37.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802747] Trainable param: llma.layers.37.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802759] Trainable param: llma.layers.37.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802768] Trainable param: llma.layers.37.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802779] Trainable param: llma.layers.37.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802789] Trainable param: llma.layers.37.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802800] Trainable param: llma.layers.37.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802810] Trainable param: llma.layers.37.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802821] Trainable param: llma.layers.37.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802831] Trainable param: llma.layers.37.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802844] Trainable param: llma.layers.38.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802854] Trainable param: llma.layers.38.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802865] Trainable param: llma.layers.38.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802875] Trainable param: 
llma.layers.38.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.802886] Trainable param: llma.layers.38.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802896] Trainable param: llma.layers.38.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.802907] Trainable param: llma.layers.38.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802917] Trainable param: llma.layers.38.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.802929] Trainable param: llma.layers.38.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802938] Trainable param: llma.layers.38.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.802949] Trainable param: llma.layers.38.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802959] Trainable param: llma.layers.38.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.802970] Trainable param: llma.layers.38.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802980] Trainable param: llma.layers.38.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.802991] Trainable param: llma.layers.38.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803001] Trainable param: llma.layers.38.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803014] Trainable param: llma.layers.39.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803024] Trainable param: llma.layers.39.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803035] Trainable param: llma.layers.39.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803045] Trainable param: llma.layers.39.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803056] Trainable param: llma.layers.39.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803065] Trainable param: llma.layers.39.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803076] Trainable param: llma.layers.39.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803086] Trainable param: llma.layers.39.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803098] Trainable param: llma.layers.39.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803108] Trainable param: llma.layers.39.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803119] Trainable param: llma.layers.39.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803128] Trainable param: llma.layers.39.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803140] Trainable param: llma.layers.39.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803149] Trainable param: llma.layers.39.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803160] Trainable param: llma.layers.39.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803171] Trainable param: llma.layers.39.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803183] Trainable param: llma.layers.40.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803193] Trainable param: llma.layers.40.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803204] Trainable param: llma.layers.40.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803214] Trainable param: llma.layers.40.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803225] Trainable param: llma.layers.40.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803235] Trainable param: llma.layers.40.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803246] Trainable param: llma.layers.40.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803255] Trainable param: llma.layers.40.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803267] Trainable param: llma.layers.40.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803277] Trainable param: llma.layers.40.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803288] Trainable param: llma.layers.40.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803298] Trainable param: llma.layers.40.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803309] Trainable param: llma.layers.40.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803319] Trainable param: llma.layers.40.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803330] Trainable param: llma.layers.40.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803340] Trainable param: llma.layers.40.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803353] Trainable param: llma.layers.41.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803363] Trainable param: llma.layers.41.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803374] Trainable param: 
llma.layers.41.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803384] Trainable param: llma.layers.41.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803395] Trainable param: llma.layers.41.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803404] Trainable param: llma.layers.41.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803415] Trainable param: llma.layers.41.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803425] Trainable param: llma.layers.41.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803437] Trainable param: llma.layers.41.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803446] Trainable param: llma.layers.41.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803457] Trainable param: llma.layers.41.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803467] Trainable param: llma.layers.41.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803478] Trainable param: llma.layers.41.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803488] Trainable param: llma.layers.41.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803499] Trainable param: llma.layers.41.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803510] Trainable param: llma.layers.41.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803523] Trainable param: llma.layers.42.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803533] Trainable param: llma.layers.42.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803544] Trainable param: llma.layers.42.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803553] Trainable param: llma.layers.42.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803565] Trainable param: llma.layers.42.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803574] Trainable param: llma.layers.42.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803585] Trainable param: llma.layers.42.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803595] Trainable param: llma.layers.42.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803607] Trainable param: llma.layers.42.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803617] Trainable param: llma.layers.42.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: 
True, dtype: torch.float32 +[17:09:51.803628] Trainable param: llma.layers.42.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803638] Trainable param: llma.layers.42.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803649] Trainable param: llma.layers.42.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803659] Trainable param: llma.layers.42.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803669] Trainable param: llma.layers.42.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803680] Trainable param: llma.layers.42.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803693] Trainable param: llma.layers.43.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803703] Trainable param: llma.layers.43.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803714] Trainable param: llma.layers.43.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803723] Trainable param: llma.layers.43.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803735] Trainable param: llma.layers.43.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803744] Trainable param: llma.layers.43.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803755] Trainable param: llma.layers.43.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803765] Trainable param: llma.layers.43.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803777] Trainable param: llma.layers.43.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803786] Trainable param: llma.layers.43.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803797] Trainable param: llma.layers.43.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803807] Trainable param: llma.layers.43.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803818] Trainable param: llma.layers.43.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803828] Trainable param: llma.layers.43.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803839] Trainable param: llma.layers.43.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803850] Trainable param: llma.layers.43.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803862] Trainable param: llma.layers.44.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803872] Trainable param: 
llma.layers.44.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803883] Trainable param: llma.layers.44.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803893] Trainable param: llma.layers.44.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803904] Trainable param: llma.layers.44.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803914] Trainable param: llma.layers.44.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803925] Trainable param: llma.layers.44.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803934] Trainable param: llma.layers.44.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803946] Trainable param: llma.layers.44.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803956] Trainable param: llma.layers.44.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803967] Trainable param: llma.layers.44.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803977] Trainable param: llma.layers.44.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803988] Trainable param: llma.layers.44.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803998] Trainable param: llma.layers.44.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804009] Trainable param: llma.layers.44.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804019] Trainable param: llma.layers.44.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804032] Trainable param: llma.layers.45.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804042] Trainable param: llma.layers.45.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804053] Trainable param: llma.layers.45.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804063] Trainable param: llma.layers.45.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804074] Trainable param: llma.layers.45.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804083] Trainable param: llma.layers.45.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804094] Trainable param: llma.layers.45.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804104] Trainable param: llma.layers.45.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804116] Trainable param: llma.layers.45.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.804126] Trainable param: llma.layers.45.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804137] Trainable param: llma.layers.45.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804146] Trainable param: llma.layers.45.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804157] Trainable param: llma.layers.45.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804167] Trainable param: llma.layers.45.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804178] Trainable param: llma.layers.45.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804189] Trainable param: llma.layers.45.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804202] Trainable param: llma.layers.46.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804211] Trainable param: llma.layers.46.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804222] Trainable param: llma.layers.46.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804232] Trainable param: llma.layers.46.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804243] Trainable param: llma.layers.46.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804253] Trainable param: llma.layers.46.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804264] Trainable param: llma.layers.46.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804273] Trainable param: llma.layers.46.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804285] Trainable param: llma.layers.46.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804295] Trainable param: llma.layers.46.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804306] Trainable param: llma.layers.46.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804316] Trainable param: llma.layers.46.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804327] Trainable param: llma.layers.46.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804337] Trainable param: llma.layers.46.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804348] Trainable param: llma.layers.46.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804358] Trainable param: llma.layers.46.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804371] Trainable param: 
llma.layers.47.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804381] Trainable param: llma.layers.47.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804392] Trainable param: llma.layers.47.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804402] Trainable param: llma.layers.47.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804413] Trainable param: llma.layers.47.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804423] Trainable param: llma.layers.47.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804434] Trainable param: llma.layers.47.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804443] Trainable param: llma.layers.47.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804455] Trainable param: llma.layers.47.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804465] Trainable param: llma.layers.47.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804476] Trainable param: llma.layers.47.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804485] Trainable param: llma.layers.47.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804496] Trainable param: llma.layers.47.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804506] Trainable param: llma.layers.47.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804517] Trainable param: llma.layers.47.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804528] Trainable param: llma.layers.47.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804541] Trainable param: llma.layers.48.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804551] Trainable param: llma.layers.48.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804562] Trainable param: llma.layers.48.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804571] Trainable param: llma.layers.48.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804583] Trainable param: llma.layers.48.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804592] Trainable param: llma.layers.48.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804603] Trainable param: llma.layers.48.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804613] Trainable param: llma.layers.48.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, 
dtype: torch.float32 +[17:09:51.804625] Trainable param: llma.layers.48.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804634] Trainable param: llma.layers.48.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804645] Trainable param: llma.layers.48.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804655] Trainable param: llma.layers.48.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804666] Trainable param: llma.layers.48.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804676] Trainable param: llma.layers.48.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804687] Trainable param: llma.layers.48.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804698] Trainable param: llma.layers.48.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804710] Trainable param: llma.layers.49.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804720] Trainable param: llma.layers.49.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804731] Trainable param: llma.layers.49.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804741] Trainable param: llma.layers.49.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804752] Trainable param: llma.layers.49.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804762] Trainable param: llma.layers.49.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804773] Trainable param: llma.layers.49.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804783] Trainable param: llma.layers.49.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804795] Trainable param: llma.layers.49.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804804] Trainable param: llma.layers.49.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804815] Trainable param: llma.layers.49.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804825] Trainable param: llma.layers.49.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804836] Trainable param: llma.layers.49.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804846] Trainable param: llma.layers.49.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804857] Trainable param: llma.layers.49.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804868] Trainable param: 
llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804881] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804891] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804902] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804912] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804923] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804933] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804944] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804953] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804965] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804974] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804985] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804995] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805006] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805016] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805027] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805037] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805050] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805060] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805071] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805081] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805092] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805102] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805113] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.805122] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805134] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805144] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805155] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805165] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805176] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805185] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805196] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805207] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805220] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805230] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805241] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805251] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805262] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805272] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805283] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805293] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805305] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805315] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805325] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805335] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805346] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805356] Trainable param: llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805367] Trainable param: 
llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805377] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805390] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805400] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805411] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805421] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805432] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805442] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805453] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805463] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805475] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805485] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805496] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805505] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805519] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805529] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805540] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805551] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805564] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805575] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805587] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805596] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805607] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805617] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[17:09:51.805628] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805638] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805650] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805659] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805670] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805680] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805691] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805701] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805712] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805722] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805735] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805745] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805756] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805766] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805777] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805787] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805798] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805807] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805819] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805829] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805840] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805849] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805860] Trainable param: llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805870] Trainable param: 
llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805881] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805892] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805905] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805915] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805926] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805936] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805947] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805956] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805967] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805977] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805989] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805999] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806010] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806020] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806031] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806040] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806051] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806062] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806075] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806085] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806096] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806106] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806117] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.806127] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806138] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806147] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806159] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806169] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806180] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806190] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806201] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806210] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806221] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806232] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806245] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806254] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806266] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806276] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806287] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806297] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806308] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806317] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806329] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806339] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806350] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806360] Trainable param: llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806371] Trainable param: 
llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806380] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806391] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806402] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806415] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806425] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806436] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806445] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806457] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806466] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806477] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806487] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806499] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806509] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806520] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806529] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806540] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806550] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806561] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806572] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806585] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806594] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806605] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806615] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[17:09:51.806626] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806636] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806647] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806657] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806669] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806678] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806689] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806699] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806710] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806720] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806731] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806741] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806754] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806764] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806775] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806785] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806796] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806806] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806817] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806827] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806839] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806848] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806860] Trainable param: llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806869] Trainable param: 
llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806880] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806890] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806901] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806912] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806925] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806934] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806946] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806955] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806966] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806976] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806987] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806997] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807009] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807019] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807030] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807040] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807051] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807060] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807071] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807082] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807095] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807105] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807116] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[17:09:51.807126] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807137] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807146] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807158] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807168] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807180] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807190] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807201] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807211] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807222] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807232] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807242] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807253] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807266] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807276] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807287] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807297] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807308] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807317] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807329] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807338] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807350] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807360] Trainable param: llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807371] Trainable param: 
llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807380] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807391] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807401] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807412] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807423] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807435] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807445] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807456] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807466] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807477] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807486] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807497] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807507] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807519] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807528] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807539] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807549] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807560] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807570] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807580] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807591] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807604] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807614] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[17:09:51.807625] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807634] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807645] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807655] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807666] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807675] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807687] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807697] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807708] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807717] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807728] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807738] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807749] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807760] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807773] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807782] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807793] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807803] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807814] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807824] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807835] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807844] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807856] Trainable param: llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807866] Trainable param: 
llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807877] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807886] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807897] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807907] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807918] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807929] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807942] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807951] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807962] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807972] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807983] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807993] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808004] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808013] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808025] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808035] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808046] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808056] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808067] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808077] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808088] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808098] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808111] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808121] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808132] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808141] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808152] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808162] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808173] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808183] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808195] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808204] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808215] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808225] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808236] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808246] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808257] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808267] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808280] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808290] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808301] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808310] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808321] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808331] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808342] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808351] Trainable param: llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808363] Trainable param: 
llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808373] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808384] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808394] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808405] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808414] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808425] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808436] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808449] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808459] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808470] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808479] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808491] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808500] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808511] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808521] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808532] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808542] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808553] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808563] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808574] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808584] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808595] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808605] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[17:09:51.808618] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808628] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808639] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808648] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808660] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808669] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808680] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808690] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808702] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808711] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808722] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808732] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808743] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808753] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808764] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808775] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808787] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808797] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808808] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808817] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808829] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808838] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808849] Trainable param: llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808859] Trainable param: 
llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808871] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808880] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808891] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808901] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808912] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808922] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808932] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808943] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808956] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808966] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808977] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808986] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808997] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809007] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809018] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809028] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809039] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809049] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809060] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809070] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809081] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809091] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809102] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[17:09:51.809112] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809125] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809134] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809145] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809155] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809166] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809176] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809187] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809196] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809208] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809217] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809229] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809238] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809249] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809259] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809270] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809281] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809294] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809303] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809314] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809324] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809335] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809344] Trainable param: llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809355] Trainable param: 
llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809365] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809377] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809387] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809398] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809423] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809435] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809444] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809455] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809466] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809479] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809488] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809500] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809509] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818748] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818762] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818775] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818785] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818798] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818808] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.818819] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818829] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818840] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818850] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[17:09:51.818861] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818871] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818885] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818894] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.818906] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818916] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818927] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818936] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818947] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818957] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818969] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818978] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.818989] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818999] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819010] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819019] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819030] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819041] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819054] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819064] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.819075] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819084] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.819095] Trainable param: llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819105] Trainable param: 
llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.819116] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819126] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819138] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819148] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819159] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819168] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819179] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819189] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819200] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819210] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819222] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819235] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819266] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[17:09:51.819272] Quantizing model to 4bit! 
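The listing above enumerates every trainable tensor together with its per-rank shard shape (local_size), whether it is split across tensor-parallel ranks (model_parallel), and its dtype; the consistent pattern is that the large attention and feed-forward weight matrices stay in torch.bfloat16 while the added biases and the RMSNorm weights train in torch.float32. A report in this format can be produced by walking named_parameters(); the sketch below is a hypothetical reconstruction, and the model_parallel attribute it reads is an assumption rather than a confirmed API of this codebase.

```python
import torch

def report_trainable(model: torch.nn.Module) -> None:
    # Print one line per trainable parameter, mimicking the log format above.
    # Assumption: tensor-parallel layers tag their parameters with a boolean
    # `model_parallel` attribute; adapt the lookup to the actual codebase.
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        print(
            f"Trainable param: {name}, local_size: {param.shape}, "
            f"model_parallel: {getattr(param, 'model_parallel', False)}, "
            f"dtype: {param.dtype}"
        )
```

After this report, the log shows the Llama-2-70b base weights being loaded and the model being quantized to 4 bit; the 0/967 progress bar that follows presumably tracks that quantization pass over the model's modules.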
+ 0%| | 0/967 [00:00<?, ?it/s] +[17:23:31.365017] Start training for 4 epochs +[17:23:31.374088] log_dir: ./output_dir +[17:23:41.365961] Epoch: [0] [0/6500] lr: 0.000000 closs: 0.6425 (0.6425) time: 9.9894 data: 2.4921 max mem: 55263 +[17:24:36.776017] Epoch: [0] [10/6500] lr: 0.000000 closs: 1.1259 (1.1314) grad_norm: 1.2729 (1.2429) time: 5.9453 data: 0.2268 max mem: 71357 +[17:25:32.743504] Epoch: [0] [20/6500] lr: 0.000000 closs: 0.9694 (1.0205) grad_norm: 1.2619 (1.2698) time: 5.5688 data: 0.0002 max mem: 71357 +[17:26:28.403906] Epoch: [0] [30/6500] lr: 0.000000 closs: 0.9322 (1.0122) grad_norm: 1.2729 (1.3065) time: 5.5813 data: 0.0002 max mem: 71357 +[17:27:24.071878] Epoch: [0] [40/6500] lr: 0.000000 closs: 0.9322 (1.0139) grad_norm: 1.1836 (1.2646) time: 5.5663 data: 0.0002 max mem: 71357 +[17:28:19.797340] Epoch: [0] [50/6500] lr: 0.000000 closs: 0.9739 (1.0026) grad_norm: 1.1697 (1.2362) time: 5.5696 data: 0.0002 max mem: 71357 +[17:29:15.536579] Epoch: [0] [60/6500] lr: 0.000000 closs: 0.9523 (0.9969) grad_norm: 1.0358 (1.2359) time: 5.5732 data: 0.0002 max mem: 71357 +[17:30:11.306733] Epoch: [0] [70/6500] lr: 0.000001 closs: 0.9465 (0.9975) grad_norm: 1.0358 (1.2176) time: 5.5754 data: 0.0002 max mem: 71357 +[17:31:06.992357] Epoch: [0] [80/6500] lr: 0.000001 closs: 0.9465 (0.9946) grad_norm: 1.0641 (1.2193) time: 5.5727 data: 0.0002 max mem: 71357 +[17:32:02.793097] Epoch: [0] [90/6500] lr: 0.000001 closs: 0.9669 (0.9922) grad_norm: 1.0981 (1.2193) time: 5.5742 data: 0.0001 max mem: 71357 +[17:32:58.479760] Epoch: [0] [100/6500] lr: 0.000001 closs: 0.9693 (0.9890) grad_norm: 1.1596 (1.2120) time: 5.5743 data: 0.0001 max mem: 71357 +[17:33:54.306944] Epoch: [0] [110/6500] lr: 0.000001 closs: 0.9520 (0.9836) grad_norm: 1.1596 (1.2033) time: 5.5756 data: 0.0002 max mem: 71357 +[17:34:50.074430] Epoch: [0] [120/6500] lr: 0.000001 closs: 0.9365 (0.9833) grad_norm: 1.0410 (1.1872) time: 5.5797 data: 0.0002 max mem: 71357 +[17:35:45.809543] Epoch: [0] [130/6500] lr: 0.000001 closs: 0.9427 (0.9864) grad_norm: 1.0198 (1.1784) time: 5.5750 data: 0.0002 max mem: 71357 +[17:36:41.515666] Epoch: [0] [140/6500] lr: 0.000001 closs: 0.9960 (0.9895) grad_norm: 1.0198 (1.1752) time: 5.5719 data: 0.0002 max mem: 71357 +[17:37:37.376263] Epoch: [0] [150/6500] lr: 0.000001 closs: 0.9367 (0.9828) grad_norm: 1.0198 (1.1735) time: 5.5782 data: 0.0002 max mem: 71357 +[17:38:33.178090] Epoch: [0] [160/6500] lr: 0.000001 closs: 0.8710 (0.9798) grad_norm: 1.0702 (1.1935) time: 5.5830 data: 0.0002 max mem: 71357 +[17:39:28.866051] Epoch: [0] [170/6500] lr: 0.000001 closs: 0.8168 (0.9763) grad_norm: 1.1173 (1.1967) time: 5.5744 data: 0.0002 max mem: 71357 +[17:40:24.644969] Epoch: [0] [180/6500] lr: 0.000001 closs: 0.9151 (0.9716) grad_norm: 1.0284 (1.1866) time: 5.5733 data: 0.0002 max mem: 71357 +[17:41:20.449725] Epoch: [0] [190/6500] lr: 0.000001 closs: 0.9151 (0.9690) grad_norm: 1.0219 (1.1783) time: 5.5791 data: 0.0001 max mem: 71357 +[17:42:16.173660] Epoch: [0] [200/6500] lr: 0.000002 closs: 0.9091 (0.9631) grad_norm: 1.0265 (1.1808) time: 5.5763 data: 0.0001 max mem: 71357 +[17:43:11.896862] Epoch: [0] [210/6500] lr: 0.000002 closs: 0.9030 (0.9615) grad_norm: 1.0265 (1.1799) time: 5.5722 data: 0.0001 max mem: 71357 +[17:44:07.617751] Epoch: [0] [220/6500] lr: 0.000002 closs: 0.9008 (0.9594) grad_norm: 1.0652 (1.1861) time: 5.5721 data: 0.0001 max mem: 71357 +[17:45:03.381846] Epoch: [0] [230/6500] lr: 0.000002 closs: 0.8917 (0.9581) grad_norm: 1.1156 (1.1990) time: 5.5742 data: 0.0001 max mem: 71357
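In the training entries above, each line reports the step, the warming-up learning rate, and then closs, grad_norm, time, and data as a recent value followed by a running statistic in parentheses; MetricLogger-style loggers typically print a windowed median or latest value first and a global average second, though this log alone does not confirm the exact definition. A minimal sketch of such a smoothed meter, under that assumption:

```python
from collections import deque

class SmoothedValue:
    # Track a metric; report the latest value and the global average,
    # matching the "0.8917 (0.9581)" style seen in the log above.
    def __init__(self, window: int = 20):
        self.window = deque(maxlen=window)  # kept for windowed variants
        self.total = 0.0
        self.count = 0

    def update(self, value: float) -> None:
        self.window.append(value)
        self.total += value
        self.count += 1

    def __str__(self) -> str:
        return f"{self.window[-1]:.4f} ({self.total / self.count:.4f})"

closs = SmoothedValue()
for v in (0.6425, 1.1259, 0.9694):
    closs.update(v)
print(f"closs: {closs}")  # -> closs: 0.9694 (0.9126)
```

Note also that lr climbs from 0.000000 toward 0.000002 over the first 230 steps, the expected signature of a linear warmup phase at the start of epoch 0.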
+[17:45:59.158047] Epoch: [0] [240/6500] lr: 0.000002 closs: 0.9274 (0.9582) grad_norm: 1.1156 (1.2056) time: 5.5769 data: 0.0001 max mem: 71357 +[17:46:54.838196] Epoch: [0] [250/6500] lr: 0.000002 closs: 0.9450 (0.9598) grad_norm: 1.1180 (1.2000) time: 5.5727 data: 0.0001 max mem: 71357 +[17:47:50.633838] Epoch: [0] [260/6500] lr: 0.000002 closs: 0.9151 (0.9571) grad_norm: 1.0244 (1.1863) time: 5.5737 data: 0.0002 max mem: 71357 +[17:48:46.374345] Epoch: [0] [270/6500] lr: 0.000002 closs: 0.8736 (0.9565) grad_norm: 0.9105 (1.1753) time: 5.5767 data: 0.0001 max mem: 71357 +[17:49:42.139732] Epoch: [0] [280/6500] lr: 0.000002 closs: 0.8896 (0.9573) grad_norm: 1.0178 (1.1818) time: 5.5752 data: 0.0002 max mem: 71357 +[17:50:37.920786] Epoch: [0] [290/6500] lr: 0.000002 closs: 0.9003 (0.9554) grad_norm: 0.8901 (1.1759) time: 5.5772 data: 0.0002 max mem: 71357 +[17:51:33.665702] Epoch: [0] [300/6500] lr: 0.000002 closs: 0.8788 (0.9538) grad_norm: 1.0452 (1.1730) time: 5.5762 data: 0.0001 max mem: 71357 +[17:52:29.335217] Epoch: [0] [310/6500] lr: 0.000002 closs: 0.8900 (0.9538) grad_norm: 1.0911 (1.1693) time: 5.5706 data: 0.0001 max mem: 71357 +[17:53:25.050732] Epoch: [0] [320/6500] lr: 0.000002 closs: 0.8907 (0.9528) grad_norm: 0.9340 (1.1656) time: 5.5692 data: 0.0001 max mem: 71357 +[17:54:20.840942] Epoch: [0] [330/6500] lr: 0.000003 closs: 0.9088 (0.9559) grad_norm: 0.9319 (1.1552) time: 5.5752 data: 0.0002 max mem: 71357 +[17:55:16.512912] Epoch: [0] [340/6500] lr: 0.000003 closs: 0.8765 (0.9528) grad_norm: 0.8839 (1.1505) time: 5.5730 data: 0.0002 max mem: 71357 +[17:56:12.171744] Epoch: [0] [350/6500] lr: 0.000003 closs: 0.8691 (0.9530) grad_norm: 0.8736 (1.1448) time: 5.5664 data: 0.0001 max mem: 71357 +[17:57:07.866853] Epoch: [0] [360/6500] lr: 0.000003 closs: 0.8886 (0.9515) grad_norm: 0.8445 (1.1351) time: 5.5676 data: 0.0001 max mem: 71357 +[17:58:03.685808] Epoch: [0] [370/6500] lr: 0.000003 closs: 0.9019 (0.9531) grad_norm: 0.8445 (1.1251) time: 5.5756 data: 0.0001 max mem: 71357 +[17:58:59.457163] Epoch: [0] [380/6500] lr: 0.000003 closs: 0.9910 (0.9514) grad_norm: 0.8201 (1.1156) time: 5.5794 data: 0.0001 max mem: 71357 +[17:59:55.171125] Epoch: [0] [390/6500] lr: 0.000003 closs: 0.8344 (0.9485) grad_norm: 0.7503 (1.1046) time: 5.5742 data: 0.0001 max mem: 71357 +[18:00:50.857523] Epoch: [0] [400/6500] lr: 0.000003 closs: 0.8829 (0.9491) grad_norm: 0.7532 (1.0999) time: 5.5699 data: 0.0001 max mem: 71357 +[18:01:46.766039] Epoch: [0] [410/6500] lr: 0.000003 closs: 0.8405 (0.9436) grad_norm: 0.7532 (1.0935) time: 5.5797 data: 0.0001 max mem: 71357 +[18:02:42.491407] Epoch: [0] [420/6500] lr: 0.000003 closs: 0.7425 (0.9431) grad_norm: 0.7532 (1.0857) time: 5.5816 data: 0.0001 max mem: 71357 +[18:03:38.223961] Epoch: [0] [430/6500] lr: 0.000003 closs: 0.8892 (0.9426) grad_norm: 0.7592 (1.0802) time: 5.5728 data: 0.0001 max mem: 71357 +[18:04:33.923170] Epoch: [0] [440/6500] lr: 0.000003 closs: 0.8794 (0.9410) grad_norm: 0.6973 (1.0741) time: 5.5715 data: 0.0001 max mem: 71357 +[18:05:29.628857] Epoch: [0] [450/6500] lr: 0.000003 closs: 0.8692 (0.9410) grad_norm: 0.6973 (1.0658) time: 5.5702 data: 0.0001 max mem: 71357 +[18:06:25.467666] Epoch: [0] [460/6500] lr: 0.000004 closs: 0.8951 (0.9400) grad_norm: 0.6579 (1.0562) time: 5.5772 data: 0.0001 max mem: 71357 +[18:07:21.090474] Epoch: [0] [470/6500] lr: 0.000004 closs: 0.9276 (0.9407) grad_norm: 0.6579 (1.0511) time: 5.5730 data: 0.0001 max mem: 71357 +[18:08:16.811633] Epoch: [0] [480/6500] lr: 0.000004 closs: 0.9387 
(0.9406) grad_norm: 0.6530 (1.0441) time: 5.5671 data: 0.0001 max mem: 71357 +[18:09:12.584026] Epoch: [0] [490/6500] lr: 0.000004 closs: 0.8099 (0.9383) grad_norm: 0.6177 (1.0377) time: 5.5746 data: 0.0001 max mem: 71357 +[18:10:08.482777] Epoch: [0] [500/6500] lr: 0.000004 closs: 0.7688 (0.9347) grad_norm: 0.6177 (1.0309) time: 5.5835 data: 0.0001 max mem: 71357 +[18:11:04.244325] Epoch: [0] [510/6500] lr: 0.000004 closs: 0.7851 (0.9333) grad_norm: 0.6626 (1.0246) time: 5.5829 data: 0.0001 max mem: 71357 +[18:11:59.905130] Epoch: [0] [520/6500] lr: 0.000004 closs: 0.8155 (0.9318) grad_norm: 0.6998 (1.0221) time: 5.5710 data: 0.0001 max mem: 71357 +[18:12:55.576114] Epoch: [0] [530/6500] lr: 0.000004 closs: 0.7851 (0.9303) grad_norm: 0.6998 (1.0148) time: 5.5665 data: 0.0001 max mem: 71357 +[18:13:51.221974] Epoch: [0] [540/6500] lr: 0.000004 closs: 0.8393 (0.9295) grad_norm: 0.6998 (1.0106) time: 5.5657 data: 0.0001 max mem: 71357 +[18:14:47.001446] Epoch: [0] [550/6500] lr: 0.000004 closs: 0.9264 (0.9299) grad_norm: 0.7249 (1.0064) time: 5.5711 data: 0.0001 max mem: 71357 +[18:15:42.665872] Epoch: [0] [560/6500] lr: 0.000004 closs: 0.8648 (0.9292) grad_norm: 0.7504 (1.0082) time: 5.5721 data: 0.0001 max mem: 71357 +[18:16:38.396948] Epoch: [0] [570/6500] lr: 0.000004 closs: 0.8370 (0.9284) grad_norm: 0.8616 (1.0058) time: 5.5697 data: 0.0001 max mem: 71357 +[18:17:34.143183] Epoch: [0] [580/6500] lr: 0.000004 closs: 0.8808 (0.9264) grad_norm: 0.8343 (1.0013) time: 5.5738 data: 0.0001 max mem: 71357 +[18:18:29.970942] Epoch: [0] [590/6500] lr: 0.000005 closs: 0.8689 (0.9260) grad_norm: 0.8343 (0.9999) time: 5.5786 data: 0.0001 max mem: 71357 +[18:19:25.719357] Epoch: [0] [600/6500] lr: 0.000005 closs: 0.8692 (0.9257) grad_norm: 0.8343 (1.0018) time: 5.5787 data: 0.0002 max mem: 71357 +[18:20:21.373952] Epoch: [0] [610/6500] lr: 0.000005 closs: 0.8144 (0.9231) grad_norm: 0.6895 (1.0004) time: 5.5701 data: 0.0002 max mem: 71357 +[18:21:17.129310] Epoch: [0] [620/6500] lr: 0.000005 closs: 0.8144 (0.9223) grad_norm: 0.6895 (0.9955) time: 5.5704 data: 0.0001 max mem: 71357 +[18:22:12.912488] Epoch: [0] [630/6500] lr: 0.000005 closs: 0.8523 (0.9212) grad_norm: 0.7028 (0.9910) time: 5.5768 data: 0.0001 max mem: 71357 +[18:23:08.618409] Epoch: [0] [640/6500] lr: 0.000005 closs: 0.8383 (0.9202) grad_norm: 0.7028 (0.9864) time: 5.5743 data: 0.0001 max mem: 71357 +[18:24:04.348829] Epoch: [0] [650/6500] lr: 0.000005 closs: 0.8570 (0.9197) grad_norm: 0.7166 (0.9852) time: 5.5717 data: 0.0002 max mem: 71357 +[18:25:00.006201] Epoch: [0] [660/6500] lr: 0.000005 closs: 0.8232 (0.9176) grad_norm: 0.7285 (0.9834) time: 5.5693 data: 0.0002 max mem: 71357 +[18:25:55.728679] Epoch: [0] [670/6500] lr: 0.000005 closs: 0.7907 (0.9164) grad_norm: 0.7166 (0.9780) time: 5.5689 data: 0.0002 max mem: 71357 +[18:26:51.493602] Epoch: [0] [680/6500] lr: 0.000005 closs: 0.8000 (0.9157) grad_norm: 0.7775 (0.9767) time: 5.5743 data: 0.0001 max mem: 71357 +[18:27:47.259175] Epoch: [0] [690/6500] lr: 0.000005 closs: 0.8076 (0.9146) grad_norm: 0.6919 (0.9753) time: 5.5764 data: 0.0001 max mem: 71357 +[18:28:42.942644] Epoch: [0] [700/6500] lr: 0.000005 closs: 0.7424 (0.9123) grad_norm: 0.6303 (0.9723) time: 5.5723 data: 0.0001 max mem: 71357 +[18:29:38.581234] Epoch: [0] [710/6500] lr: 0.000005 closs: 0.7424 (0.9105) grad_norm: 0.7645 (0.9741) time: 5.5660 data: 0.0002 max mem: 71357 +[18:30:34.409782] Epoch: [0] [720/6500] lr: 0.000006 closs: 0.7110 (0.9079) grad_norm: 0.7526 (0.9709) time: 5.5732 data: 0.0002 max mem: 
71357 +[18:31:30.096730] Epoch: [0] [730/6500] lr: 0.000006 closs: 0.7914 (0.9071) grad_norm: 0.6638 (0.9656) time: 5.5757 data: 0.0001 max mem: 71357 +[18:32:25.758320] Epoch: [0] [740/6500] lr: 0.000006 closs: 0.8183 (0.9059) grad_norm: 0.6975 (0.9641) time: 5.5673 data: 0.0001 max mem: 71357 +[18:33:21.499555] Epoch: [0] [750/6500] lr: 0.000006 closs: 0.7790 (0.9051) grad_norm: 0.6621 (0.9590) time: 5.5700 data: 0.0001 max mem: 71357 +[18:34:17.084158] Epoch: [0] [760/6500] lr: 0.000006 closs: 0.8239 (0.9048) grad_norm: 0.6638 (0.9582) time: 5.5662 data: 0.0001 max mem: 71357 +[18:35:12.977650] Epoch: [0] [770/6500] lr: 0.000006 closs: 0.8609 (0.9039) grad_norm: 0.6498 (0.9539) time: 5.5738 data: 0.0001 max mem: 71357 +[18:36:08.664952] Epoch: [0] [780/6500] lr: 0.000006 closs: 0.7882 (0.9016) grad_norm: 0.6142 (0.9481) time: 5.5789 data: 0.0002 max mem: 71357 +[18:37:04.413888] Epoch: [0] [790/6500] lr: 0.000006 closs: 0.7919 (0.9005) grad_norm: 0.6271 (0.9440) time: 5.5717 data: 0.0001 max mem: 71357 +[18:38:00.067141] Epoch: [0] [800/6500] lr: 0.000006 closs: 0.8323 (0.9000) grad_norm: 0.5438 (0.9401) time: 5.5700 data: 0.0001 max mem: 71357 +[18:38:55.953874] Epoch: [0] [810/6500] lr: 0.000006 closs: 0.8323 (0.8978) grad_norm: 0.5438 (0.9353) time: 5.5769 data: 0.0001 max mem: 71357 +[18:39:51.627904] Epoch: [0] [820/6500] lr: 0.000006 closs: 0.7819 (0.8968) grad_norm: 0.5438 (0.9310) time: 5.5779 data: 0.0002 max mem: 71357 +[18:40:47.301617] Epoch: [0] [830/6500] lr: 0.000006 closs: 0.7707 (0.8952) grad_norm: 0.5119 (0.9265) time: 5.5673 data: 0.0002 max mem: 71357 +[18:41:43.054599] Epoch: [0] [840/6500] lr: 0.000006 closs: 0.7601 (0.8950) grad_norm: 0.5119 (0.9249) time: 5.5712 data: 0.0001 max mem: 71357 +[18:42:38.803150] Epoch: [0] [850/6500] lr: 0.000007 closs: 0.8690 (0.8952) grad_norm: 0.5323 (0.9204) time: 5.5749 data: 0.0001 max mem: 71357 +[18:43:34.505421] Epoch: [0] [860/6500] lr: 0.000007 closs: 0.8132 (0.8943) grad_norm: 0.5332 (0.9166) time: 5.5724 data: 0.0002 max mem: 71357 +[18:44:30.116370] Epoch: [0] [870/6500] lr: 0.000007 closs: 0.8058 (0.8946) grad_norm: 0.5786 (0.9147) time: 5.5655 data: 0.0002 max mem: 71357 +[18:45:25.754743] Epoch: [0] [880/6500] lr: 0.000007 closs: 0.8048 (0.8936) grad_norm: 0.5437 (0.9109) time: 5.5623 data: 0.0002 max mem: 71357 +[18:46:21.437868] Epoch: [0] [890/6500] lr: 0.000007 closs: 0.7226 (0.8921) grad_norm: 0.5817 (0.9073) time: 5.5659 data: 0.0001 max mem: 71357 +[18:47:17.526399] Epoch: [0] [900/6500] lr: 0.000007 closs: 0.7589 (0.8912) grad_norm: 0.6070 (0.9076) time: 5.5884 data: 0.0001 max mem: 71357 +[18:48:13.144234] Epoch: [0] [910/6500] lr: 0.000007 closs: 0.8236 (0.8904) grad_norm: 0.5817 (0.9036) time: 5.5852 data: 0.0002 max mem: 71357 +[18:49:08.786046] Epoch: [0] [920/6500] lr: 0.000007 closs: 0.8949 (0.8904) grad_norm: 0.5817 (0.9000) time: 5.5629 data: 0.0002 max mem: 71357 +[18:50:04.500343] Epoch: [0] [930/6500] lr: 0.000007 closs: 0.8826 (0.8899) grad_norm: 0.5138 (0.8955) time: 5.5677 data: 0.0002 max mem: 71357 +[18:51:00.386039] Epoch: [0] [940/6500] lr: 0.000007 closs: 0.8056 (0.8898) grad_norm: 0.5107 (0.8927) time: 5.5799 data: 0.0002 max mem: 71357 +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use).
+[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
+[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
+    result = agent.run()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
+    result = self._invoke_run(role)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
+    self._initialize_workers(self._worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
+    self._rendezvous(worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous
+    store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous
+    self._store = TCPStore( # type: ignore[call-arg]
+RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
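The errno 98 failures above mean a second torchrun launch tried to bind its static rendezvous to port 1113 while the earlier run still held it. A minimal sketch of one way around the collision, letting the OS hand out an unused port that can then be passed to torchrun via its --master_port flag (find_free_port is a hypothetical helper, not part of this repo):

import socket

def find_free_port() -> int:
    # Bind to port 0 so the OS assigns an unused ephemeral port,
    # then release the socket and reuse the number for torchrun.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

if __name__ == "__main__":
    print(find_free_port())

Launching with, e.g., torchrun --master_port=$(python find_free_port.py) ... then avoids reusing a port that a still-running job occupies; note there is a small race between printing the port and torchrun binding it.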
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+> /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py(134)main()
+    133  import ipdb;ipdb.set_trace()
+--> 134  misc.init_distributed_mode(args)
+    135  fs_init.initialize_model_parallel(args.model_parallel_size)
+
+ipdb> *** SyntaxError: invalid syntax
+ipdb> > /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py(134)main()
+    133  import ipdb;ipdb.set_trace()
+--> 134  misc.init_distributed_mode(args)
+    135  fs_init.initialize_model_parallel(args.model_parallel_size)
+
+ipdb> [18:51:56.159949] Epoch: [0] [950/6500] lr: 0.000007 closs: 0.8056 (0.8895) grad_norm: 0.5041 (0.8887) time: 5.5829 data: 0.0002 max mem: 71357
+[18:52:51.944635] Epoch: [0] [960/6500] lr: 0.000007 closs: 0.8003 (0.8885) grad_norm: 0.4752 (0.8862) time: 5.5779 data: 0.0002 max mem: 71357
+True
+ipdb> [18:53:47.670278] Epoch: [0] [970/6500] lr: 0.000007 closs: 0.8003 (0.8873) grad_norm: 0.5352 (0.8829) time: 5.5754 data: 0.0001 max mem: 71357
+[18:54:43.331543] Epoch: [0] [980/6500] lr: 0.000008 closs: 0.7697 (0.8869) grad_norm: 0.5414 (0.8806) time: 5.5692 data: 0.0002 max mem: 71357
+[18:55:39.144294] Epoch: [0] [990/6500] lr: 0.000008 closs: 0.7697 (0.8861) grad_norm: 0.5501 (0.8777) time: 5.5736 data: 0.0002 max mem: 71357
+[18:56:34.832623] Epoch: [0] [1000/6500] lr: 0.000008 closs: 0.7968 (0.8853) grad_norm: 0.5508 (0.8757) time: 5.5750 data: 0.0001 max mem: 71357
+[18:57:30.584157] Epoch: [0] [1010/6500] lr: 0.000008 closs: 0.8029 (0.8847) grad_norm: 0.6143 (0.8732) time: 5.5719 data: 0.0001 max mem: 71357
+[18:58:26.208202] Epoch: [0] [1020/6500] lr: 0.000008 closs: 0.8208 (0.8845) grad_norm: 0.6465 (0.8734) time: 5.5687 data: 0.0001 max mem: 71357
+[18:59:22.004799] Epoch: [0] [1030/6500] lr: 0.000008 closs: 0.7216 (0.8831) grad_norm: 0.6336 (0.8707) time: 5.5709 data: 0.0002 max mem: 71357
+[19:00:17.779495] Epoch: [0] [1040/6500] lr: 0.000008 closs: 0.7816 (0.8825) grad_norm: 0.5328 (0.8678) time: 5.5785 data: 0.0002 max mem: 71357
+[19:01:13.491558] Epoch: [0] [1050/6500] lr: 0.000008 closs: 0.8551 (0.8816) grad_norm: 0.5170 (0.8638) time: 5.5742 data: 0.0001 max mem: 71357
+[19:02:09.272274] Epoch: [0] [1060/6500] lr: 0.000008 closs: 0.7913 (0.8810) grad_norm: 0.5079 (0.8599) time: 5.5745 data: 0.0001 max mem: 71357
+[19:03:05.080375] Epoch: [0] [1070/6500] lr: 0.000008 closs: 0.8080 (0.8807) grad_norm: 0.4961 (0.8577) time: 5.5794 data: 0.0001 max mem: 71357
+[19:04:00.844337] Epoch: [0] [1080/6500] lr: 0.000008 closs: 0.8225 (0.8806) grad_norm: 0.4745 (0.8547) time: 5.5785 data: 0.0001 max mem: 71357
+
+Traceback (most recent call last):
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 319, in <module>
+    main(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 134, in main
+    misc.init_distributed_mode(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 134, in main
+    misc.init_distributed_mode(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/bdb.py", line 90, in trace_dispatch
+    return self.dispatch_line(frame)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/bdb.py", line 115, in dispatch_line
+    if self.quitting: raise BdbQuit
+bdb.BdbQuit
+
+If you suspect this is an IPython 8.14.0 bug, please report it at:
+    https://github.com/ipython/ipython/issues
+or send an email to the mailing list at ipython-dev@python.org
+
+You can print a more detailed traceback right now with "%tb", or use "%debug"
+to interactively debug it.
+
+Extra-detailed tracebacks for bug-reporting purposes can be enabled via:
+    %config Application.verbose_crash=True
+
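The bdb.BdbQuit crash above is the leftover "import ipdb;ipdb.set_trace()" at main_finetune.py line 133 firing inside a multi-process torchrun job: once the debugger is quit on one worker, BdbQuit propagates out of main() and takes that rank down. A minimal guard, assuming the RANK environment variable that torchrun sets for each worker, that would confine the breakpoint to a single process:

import os

# Only rank 0 drops into the debugger; all other ranks keep running.
# A bare ipdb.set_trace() in main() is what produced the BdbQuit above.
if int(os.environ.get("RANK", "0")) == 0:
    import ipdb
    ipdb.set_trace()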
use "%debug" +to interactively debug it. + +Extra-detailed tracebacks for bug-reporting purposes can be enabled via: + %config Application.verbose_crash=True + +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 130230 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 130231) of binary: /data/anaconda3/envs/accessory/bin/python3.10 +Traceback (most recent call last): + File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in + sys.exit(main()) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper + return f(*args, **kwargs) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main + run(args) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +main_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-08-10_19:04:32 + host : iZ2ze8qpzapxkhyc9k2qojZ + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 130231) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +[19:04:56.595547] Epoch: [0] [1090/6500] lr: 0.000008 closs: 0.8706 (0.8802) grad_norm: 0.4761 (0.8526) time: 5.5757 data: 0.0001 max mem: 71357 +[19:05:52.420471] Epoch: [0] [1100/6500] lr: 0.000008 closs: 0.8706 (0.8799) grad_norm: 0.4761 (0.8506) time: 5.5787 data: 0.0001 max mem: 71357 +[19:06:48.151002] Epoch: [0] [1110/6500] lr: 0.000009 closs: 0.8431 (0.8795) grad_norm: 0.4751 (0.8474) time: 5.5777 data: 0.0001 max mem: 71357 +[19:07:44.022831] Epoch: [0] [1120/6500] lr: 0.000009 closs: 0.7763 (0.8782) grad_norm: 0.4751 (0.8446) time: 5.5800 data: 0.0001 max mem: 71357 +[19:08:39.741796] Epoch: [0] [1130/6500] lr: 0.000009 closs: 0.7718 (0.8775) grad_norm: 0.5106 (0.8439) time: 5.5794 data: 0.0001 max mem: 71357 +[19:09:35.445169] Epoch: [0] [1140/6500] lr: 0.000009 closs: 0.7377 (0.8759) grad_norm: 0.5852 (0.8474) time: 5.5710 data: 0.0001 max mem: 71357 +[19:10:31.229899] Epoch: [0] [1150/6500] lr: 0.000009 closs: 0.7763 (0.8757) grad_norm: 0.5936 (0.8482) time: 5.5743 data: 0.0002 max mem: 71357 +[19:11:27.112847] Epoch: [0] [1160/6500] lr: 0.000009 closs: 0.8189 (0.8754) grad_norm: 0.5936 (0.8451) time: 5.5833 data: 0.0002 max mem: 71357 +[19:12:22.770834] Epoch: [0] [1170/6500] lr: 0.000009 closs: 0.7905 (0.8742) grad_norm: 0.5854 (0.8432) time: 5.5769 data: 0.0002 max mem: 71357 +[19:13:18.473532] Epoch: [0] [1180/6500] lr: 0.000009 closs: 0.7743 (0.8739) grad_norm: 0.4815 (0.8403) time: 5.5679 data: 0.0001 max mem: 71357 +[19:14:14.090142] Epoch: [0] [1190/6500] lr: 0.000009 closs: 0.8370 (0.8738) grad_norm: 0.4815 (0.8390) time: 5.5659 data: 0.0001 max mem: 71357 
+[19:04:56.595547] Epoch: [0] [1090/6500] lr: 0.000008 closs: 0.8706 (0.8802) grad_norm: 0.4761 (0.8526) time: 5.5757 data: 0.0001 max mem: 71357
+[19:05:52.420471] Epoch: [0] [1100/6500] lr: 0.000008 closs: 0.8706 (0.8799) grad_norm: 0.4761 (0.8506) time: 5.5787 data: 0.0001 max mem: 71357
+[19:06:48.151002] Epoch: [0] [1110/6500] lr: 0.000009 closs: 0.8431 (0.8795) grad_norm: 0.4751 (0.8474) time: 5.5777 data: 0.0001 max mem: 71357
+[19:07:44.022831] Epoch: [0] [1120/6500] lr: 0.000009 closs: 0.7763 (0.8782) grad_norm: 0.4751 (0.8446) time: 5.5800 data: 0.0001 max mem: 71357
+[19:08:39.741796] Epoch: [0] [1130/6500] lr: 0.000009 closs: 0.7718 (0.8775) grad_norm: 0.5106 (0.8439) time: 5.5794 data: 0.0001 max mem: 71357
+[19:09:35.445169] Epoch: [0] [1140/6500] lr: 0.000009 closs: 0.7377 (0.8759) grad_norm: 0.5852 (0.8474) time: 5.5710 data: 0.0001 max mem: 71357
+[19:10:31.229899] Epoch: [0] [1150/6500] lr: 0.000009 closs: 0.7763 (0.8757) grad_norm: 0.5936 (0.8482) time: 5.5743 data: 0.0002 max mem: 71357
+[19:11:27.112847] Epoch: [0] [1160/6500] lr: 0.000009 closs: 0.8189 (0.8754) grad_norm: 0.5936 (0.8451) time: 5.5833 data: 0.0002 max mem: 71357
+[19:12:22.770834] Epoch: [0] [1170/6500] lr: 0.000009 closs: 0.7905 (0.8742) grad_norm: 0.5854 (0.8432) time: 5.5769 data: 0.0002 max mem: 71357
+[19:13:18.473532] Epoch: [0] [1180/6500] lr: 0.000009 closs: 0.7743 (0.8739) grad_norm: 0.4815 (0.8403) time: 5.5679 data: 0.0001 max mem: 71357
+[19:14:14.090142] Epoch: [0] [1190/6500] lr: 0.000009 closs: 0.8370 (0.8738) grad_norm: 0.4815 (0.8390) time: 5.5659 data: 0.0001 max mem: 71357
+[19:15:09.821953] Epoch: [0] [1200/6500] lr: 0.000009 closs: 0.7973 (0.8732) grad_norm: 0.5191 (0.8370) time: 5.5673 data: 0.0001 max mem: 71357
+[19:16:05.632464] Epoch: [0] [1210/6500] lr: 0.000009 closs: 0.7952 (0.8725) grad_norm: 0.5191 (0.8346) time: 5.5770 data: 0.0001 max mem: 71357
+[19:17:01.237737] Epoch: [0] [1220/6500] lr: 0.000009 closs: 0.7952 (0.8720) grad_norm: 0.5838 (0.8334) time: 5.5707 data: 0.0001 max mem: 71357
+[19:17:56.918626] Epoch: [0] [1230/6500] lr: 0.000009 closs: 0.8244 (0.8716) grad_norm: 0.5838 (0.8319) time: 5.5642 data: 0.0002 max mem: 71357
+[19:18:52.636548] Epoch: [0] [1240/6500] lr: 0.000010 closs: 0.8378 (0.8712) grad_norm: 0.6067 (0.8326) time: 5.5698 data: 0.0001 max mem: 71357
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use).
+[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
+[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
+    result = agent.run()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
+    result = self._invoke_run(role)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
+    self._initialize_workers(self._worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
+    self._rendezvous(worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous
+    store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
"/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] +RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use). +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 0): env://, gpu 0 +[19:19:28.076271] > initializing model parallel with size 1 +[19:19:28.076336] > initializing ddp with size 2 +[19:19:28.076342] > initializing pipeline with size 1 +[19:19:28.121707] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[19:19:28.121800] Namespace(batch_size=4, +accum_iter=2, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-70b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', +log_dir='./output_dir', +save_interval=1, +device='cuda', +seed=0, +resume='', +num_workers=8, +pin_mem=True, +world_size=2, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[19:19:28.133114] Model Args: + ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[19:19:48.469330] Epoch: [0] [1250/6500] lr: 0.000010 closs: 0.7255 (0.8696) grad_norm: 0.6067 (0.8306) time: 5.5775 data: 0.0002 max mem: 71357 +[19:20:44.159763] Epoch: [0] [1260/6500] lr: 0.000010 closs: 0.7255 (0.8691) grad_norm: 0.6007 (0.8286) time: 5.5761 data: 0.0002 max mem: 71357 +[19:21:39.944680] Epoch: [0] [1270/6500] lr: 0.000010 closs: 0.8232 (0.8687) grad_norm: 0.5559 (0.8273) time: 5.5737 data: 0.0002 max mem: 71357 +[19:22:35.727072] Epoch: [0] [1280/6500] lr: 0.000010 closs: 0.7722 (0.8681) grad_norm: 0.6128 (0.8262) time: 5.5782 data: 0.0002 max mem: 71357 +[19:23:31.545412] Epoch: [0] [1290/6500] lr: 0.000010 closs: 0.7420 (0.8668) grad_norm: 0.5559 (0.8239) time: 5.5799 data: 0.0002 max mem: 71357 +[19:24:27.267608] Epoch: [0] [1300/6500] lr: 0.000010 closs: 0.7926 (0.8669) grad_norm: 0.6245 (0.8228) time: 5.5769 data: 0.0002 max mem: 71357 +[19:25:23.128280] Epoch: [0] [1310/6500] lr: 0.000010 closs: 0.7852 (0.8662) grad_norm: 0.6224 (0.8206) time: 5.5791 data: 0.0002 max mem: 71357 +[19:26:18.848671] Epoch: [0] [1320/6500] lr: 0.000010 closs: 0.7704 (0.8660) grad_norm: 0.5277 (0.8185) time: 5.5789 data: 0.0002 max mem: 71357 +[19:27:14.498952] Epoch: [0] [1330/6500] lr: 0.000010 closs: 0.8118 (0.8657) grad_norm: 0.5535 (0.8175) time: 
5.5684 data: 0.0002 max mem: 71357 +[19:27:29.243249] Model is Peft: True +[19:27:29.250590] Trainable parameter count : 8036352 (local rank), 8036352 (all). +[19:27:29.274209] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274240] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274253] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274266] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274276] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274288] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274299] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274310] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274320] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274333] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274343] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274355] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274365] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274376] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274386] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274397] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274408] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274423] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274435] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274446] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274456] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274468] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274478] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.274489] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274499] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274511] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274521] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274532] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274542] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274553] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274563] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274574] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274585] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274599] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274609] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274620] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274630] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274641] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274651] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274662] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274672] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274684] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274694] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274705] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274715] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274726] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274736] Trainable param: 
llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274748] Trainable param: llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274759] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274772] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274782] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274793] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274803] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274814] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274824] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274835] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274845] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274857] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274867] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274878] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274888] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274900] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274909] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274921] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274931] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274945] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274954] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274966] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274976] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274987] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 
+[19:27:29.274997] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275008] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275018] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275030] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275040] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275052] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275063] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275075] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275085] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275096] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275107] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275120] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275130] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275142] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275151] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275163] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275172] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275184] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275194] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275206] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275215] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275227] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275237] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275248] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 
8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275258] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275269] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275282] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275295] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275305] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275316] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275326] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275338] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275347] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275359] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275368] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275380] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275390] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275401] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275411] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275423] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275432] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275443] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275454] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275467] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275477] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275488] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275498] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275509] Trainable param: llma.layers.7.attention.wv.weight, 
local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275519] Trainable param: llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275531] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275540] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275553] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275563] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275574] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275584] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275595] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275605] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275616] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275627] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275640] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275650] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275661] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275675] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275692] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275703] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275714] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275724] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275736] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275746] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275758] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275768] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275779] 
Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275789] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275800] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275812] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275825] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275835] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275846] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275856] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275868] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275878] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275889] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275899] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275911] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275921] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275932] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275942] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275954] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275963] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275975] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275986] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275999] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276009] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276021] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276030] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[19:27:29.276042] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276051] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276063] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276073] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276085] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276095] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276106] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276116] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276127] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276137] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276148] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276159] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276172] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276182] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276194] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276203] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276215] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276224] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276236] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276245] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276257] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276267] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276278] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276288] Trainable param: 
llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276299] Trainable param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276309] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276320] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276331] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276344] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276354] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276366] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276375] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276387] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276396] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276408] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276417] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276430] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276439] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276451] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276460] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276472] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276481] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276493] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276504] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276516] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276526] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276538] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16
+[19:27:29.276547] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276559] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276568] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276580] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276590] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276602] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276611] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276623] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276633] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276644] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276654] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276665] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276676] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276689] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276699] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.276710] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276720] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276731] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276741] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276752] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276762] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276774] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276784] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276795] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276805] Trainable param: llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276816] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276826] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276837] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276848] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276861] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276871] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.276882] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276892] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276903] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276913] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276924] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276934] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276947] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276956] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276968] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276977] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276989] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276998] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277009] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277020] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277033] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277043] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277055] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277065] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277076] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277086] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277097] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277107] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277119] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277129] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277140] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277150] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277161] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277171] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277182] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277193] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277206] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277216] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277227] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277237] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277248] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277258] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277269] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277279] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277291] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277301] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277312] Trainable param: llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277322] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277333] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277343] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277355] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277365] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277378] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277388] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277400] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277409] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277421] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277431] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277442] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277452] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277464] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277474] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277485] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277495] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277506] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277521] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277532] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277543] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277556] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277570] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277581] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277591] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277603] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277613] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277624] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277634] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277646] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277656] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277667] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277677] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277688] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277698] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277709] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277720] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277733] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277743] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277754] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277764] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277775] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277785] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277796] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277806] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277818] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277828] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277839] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277849] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277861] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277870] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277881] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277892] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277905] Trainable param: llma.layers.21.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277915] Trainable param: llma.layers.21.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277926] Trainable param: llma.layers.21.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277936] Trainable param: llma.layers.21.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277947] Trainable param: llma.layers.21.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277957] Trainable param: llma.layers.21.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277968] Trainable param: llma.layers.21.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277978] Trainable param: llma.layers.21.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277990] Trainable param: llma.layers.21.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278000] Trainable param: llma.layers.21.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278011] Trainable param: llma.layers.21.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278021] Trainable param: llma.layers.21.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278032] Trainable param: llma.layers.21.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278042] Trainable param: llma.layers.21.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278053] Trainable param: llma.layers.21.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278064] Trainable param: llma.layers.21.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278077] Trainable param: llma.layers.22.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278086] Trainable param: llma.layers.22.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278098] Trainable param: llma.layers.22.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278107] Trainable param: llma.layers.22.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278118] Trainable param: llma.layers.22.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278128] Trainable param: llma.layers.22.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278139] Trainable param: llma.layers.22.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278149] Trainable param: llma.layers.22.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278161] Trainable param: llma.layers.22.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278171] Trainable param: llma.layers.22.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278182] Trainable param: llma.layers.22.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278192] Trainable param: llma.layers.22.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278203] Trainable param: llma.layers.22.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278213] Trainable param: llma.layers.22.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278224] Trainable param: llma.layers.22.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278235] Trainable param: llma.layers.22.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278248] Trainable param: llma.layers.23.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278258] Trainable param: llma.layers.23.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278269] Trainable param: llma.layers.23.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278279] Trainable param: llma.layers.23.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278290] Trainable param: llma.layers.23.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278300] Trainable param: llma.layers.23.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278311] Trainable param: llma.layers.23.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278321] Trainable param: llma.layers.23.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278333] Trainable param: llma.layers.23.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278343] Trainable param: llma.layers.23.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278354] Trainable param: llma.layers.23.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278364] Trainable param: llma.layers.23.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278375] Trainable param: llma.layers.23.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278385] Trainable param: llma.layers.23.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278396] Trainable param: llma.layers.23.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278407] Trainable param: llma.layers.23.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278420] Trainable param: llma.layers.24.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278429] Trainable param: llma.layers.24.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278441] Trainable param: llma.layers.24.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278451] Trainable param: llma.layers.24.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278462] Trainable param: llma.layers.24.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278472] Trainable param: llma.layers.24.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278483] Trainable param: llma.layers.24.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278493] Trainable param: llma.layers.24.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278505] Trainable param: llma.layers.24.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278515] Trainable param: llma.layers.24.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278526] Trainable param: llma.layers.24.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278536] Trainable param: llma.layers.24.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278547] Trainable param: llma.layers.24.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278557] Trainable param: llma.layers.24.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278568] Trainable param: llma.layers.24.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278579] Trainable param: llma.layers.24.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278595] Trainable param: llma.layers.25.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278605] Trainable param: llma.layers.25.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278616] Trainable param: llma.layers.25.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278626] Trainable param: llma.layers.25.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278637] Trainable param: llma.layers.25.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278647] Trainable param: llma.layers.25.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278658] Trainable param: llma.layers.25.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278668] Trainable param: llma.layers.25.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278680] Trainable param: llma.layers.25.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278690] Trainable param: llma.layers.25.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278701] Trainable param: llma.layers.25.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278711] Trainable param: llma.layers.25.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278722] Trainable param: llma.layers.25.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278732] Trainable param: llma.layers.25.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278743] Trainable param: llma.layers.25.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278754] Trainable param: llma.layers.25.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278767] Trainable param: llma.layers.26.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278776] Trainable param: llma.layers.26.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278788] Trainable param: llma.layers.26.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278797] Trainable param: llma.layers.26.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278809] Trainable param: llma.layers.26.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278819] Trainable param: llma.layers.26.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278830] Trainable param: llma.layers.26.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278840] Trainable param: llma.layers.26.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278851] Trainable param: llma.layers.26.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278861] Trainable param: llma.layers.26.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278872] Trainable param: llma.layers.26.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278882] Trainable param: llma.layers.26.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278893] Trainable param: llma.layers.26.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278903] Trainable param: llma.layers.26.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278915] Trainable param: llma.layers.26.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278925] Trainable param: llma.layers.26.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278938] Trainable param: llma.layers.27.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278948] Trainable param: llma.layers.27.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278959] Trainable param: llma.layers.27.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278969] Trainable param: llma.layers.27.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278981] Trainable param: llma.layers.27.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278990] Trainable param: llma.layers.27.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279002] Trainable param: llma.layers.27.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279012] Trainable param: llma.layers.27.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279024] Trainable param: llma.layers.27.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279034] Trainable param: llma.layers.27.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279045] Trainable param: llma.layers.27.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279055] Trainable param: llma.layers.27.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279066] Trainable param: llma.layers.27.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279076] Trainable param: llma.layers.27.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279087] Trainable param: llma.layers.27.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279098] Trainable param: llma.layers.27.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279110] Trainable param: llma.layers.28.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279120] Trainable param: llma.layers.28.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279131] Trainable param: llma.layers.28.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279141] Trainable param: llma.layers.28.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279153] Trainable param: llma.layers.28.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279163] Trainable param: llma.layers.28.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279174] Trainable param: llma.layers.28.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279184] Trainable param: llma.layers.28.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279196] Trainable param: llma.layers.28.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279206] Trainable param: llma.layers.28.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279217] Trainable param: llma.layers.28.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279227] Trainable param: llma.layers.28.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279238] Trainable param: llma.layers.28.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279248] Trainable param: llma.layers.28.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279259] Trainable param: llma.layers.28.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279270] Trainable param: llma.layers.28.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279283] Trainable param: llma.layers.29.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279293] Trainable param: llma.layers.29.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279304] Trainable param: llma.layers.29.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279314] Trainable param: llma.layers.29.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279325] Trainable param: llma.layers.29.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279335] Trainable param: llma.layers.29.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279346] Trainable param: llma.layers.29.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279356] Trainable param: llma.layers.29.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279368] Trainable param: llma.layers.29.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279378] Trainable param: llma.layers.29.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279389] Trainable param: llma.layers.29.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279399] Trainable param: llma.layers.29.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279410] Trainable param: llma.layers.29.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279420] Trainable param: llma.layers.29.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279431] Trainable param: llma.layers.29.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279442] Trainable param: llma.layers.29.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279455] Trainable param: llma.layers.30.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279465] Trainable param: llma.layers.30.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279476] Trainable param: llma.layers.30.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279486] Trainable param: llma.layers.30.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279497] Trainable param: llma.layers.30.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279507] Trainable param: llma.layers.30.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279518] Trainable param: llma.layers.30.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279527] Trainable param: llma.layers.30.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279539] Trainable param: llma.layers.30.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279549] Trainable param: llma.layers.30.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279561] Trainable param: llma.layers.30.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279570] Trainable param: llma.layers.30.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279582] Trainable param: llma.layers.30.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279591] Trainable param: llma.layers.30.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279603] Trainable param: llma.layers.30.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279613] Trainable param: llma.layers.30.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279626] Trainable param: llma.layers.31.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279636] Trainable param: llma.layers.31.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279648] Trainable param: llma.layers.31.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279657] Trainable param: llma.layers.31.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279669] Trainable param: llma.layers.31.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279678] Trainable param: llma.layers.31.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279690] Trainable param: llma.layers.31.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279700] Trainable param: llma.layers.31.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279712] Trainable param: llma.layers.31.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279722] Trainable param: llma.layers.31.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279733] Trainable param: llma.layers.31.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279743] Trainable param: llma.layers.31.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279754] Trainable param: llma.layers.31.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279764] Trainable param: llma.layers.31.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279775] Trainable param: llma.layers.31.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279786] Trainable param: llma.layers.31.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279799] Trainable param: llma.layers.32.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279809] Trainable param: llma.layers.32.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279821] Trainable param: llma.layers.32.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279830] Trainable param: llma.layers.32.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279842] Trainable param: llma.layers.32.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279852] Trainable param: llma.layers.32.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279863] Trainable param: llma.layers.32.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279873] Trainable param: llma.layers.32.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279885] Trainable param: llma.layers.32.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279895] Trainable param: llma.layers.32.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279906] Trainable param: llma.layers.32.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279916] Trainable param: llma.layers.32.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279927] Trainable param: llma.layers.32.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279937] Trainable param: llma.layers.32.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279948] Trainable param: llma.layers.32.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279959] Trainable param: llma.layers.32.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279972] Trainable param: llma.layers.33.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279982] Trainable param: llma.layers.33.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279993] Trainable param: llma.layers.33.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280003] Trainable param: llma.layers.33.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280014] Trainable param: llma.layers.33.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280024] Trainable param: llma.layers.33.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280035] Trainable param: llma.layers.33.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280045] Trainable param: llma.layers.33.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280057] Trainable param: llma.layers.33.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280067] Trainable param: llma.layers.33.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280079] Trainable param: llma.layers.33.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280088] Trainable param: llma.layers.33.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280099] Trainable param: llma.layers.33.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280109] Trainable param: llma.layers.33.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280120] Trainable param: llma.layers.33.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280131] Trainable param: llma.layers.33.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280144] Trainable param: llma.layers.34.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280154] Trainable param: llma.layers.34.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280165] Trainable param: llma.layers.34.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280175] Trainable param: llma.layers.34.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280186] Trainable param: llma.layers.34.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280196] Trainable param: llma.layers.34.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280207] Trainable param: llma.layers.34.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280217] Trainable param: llma.layers.34.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280229] Trainable param: llma.layers.34.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280239] Trainable param: llma.layers.34.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280250] Trainable param: llma.layers.34.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280260] Trainable param: llma.layers.34.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280271] Trainable param: llma.layers.34.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280281] Trainable param: llma.layers.34.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280293] Trainable param: llma.layers.34.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280304] Trainable param: llma.layers.34.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280316] Trainable param: llma.layers.35.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280326] Trainable param: llma.layers.35.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280338] Trainable param: llma.layers.35.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280348] Trainable param: llma.layers.35.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280359] Trainable param: llma.layers.35.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280369] Trainable param: llma.layers.35.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280380] Trainable param: llma.layers.35.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280389] Trainable param: llma.layers.35.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280401] Trainable param: llma.layers.35.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280411] Trainable param: llma.layers.35.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280423] Trainable param: llma.layers.35.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280432] Trainable param: llma.layers.35.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280443] Trainable param: llma.layers.35.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280453] Trainable param: llma.layers.35.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280464] Trainable param: llma.layers.35.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280475] Trainable param: llma.layers.35.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280488] Trainable param: llma.layers.36.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280498] Trainable param: llma.layers.36.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280509] Trainable param: llma.layers.36.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280519] Trainable param: llma.layers.36.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280530] Trainable param: llma.layers.36.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280540] Trainable param: llma.layers.36.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280551] Trainable param: llma.layers.36.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280561] Trainable param: llma.layers.36.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280573] Trainable param: llma.layers.36.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280583] Trainable param: llma.layers.36.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280594] Trainable param: llma.layers.36.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280604] Trainable param: llma.layers.36.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280615] Trainable param: llma.layers.36.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280625] Trainable param: llma.layers.36.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280636] Trainable param: llma.layers.36.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280647] Trainable param: llma.layers.36.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280660] Trainable param: llma.layers.37.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280669] Trainable param: llma.layers.37.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280681] Trainable param: llma.layers.37.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280691] Trainable param: llma.layers.37.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280702] Trainable param: llma.layers.37.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280712] Trainable param: llma.layers.37.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280723] Trainable param: llma.layers.37.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280733] Trainable param: llma.layers.37.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280745] Trainable param: llma.layers.37.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280755] Trainable param: llma.layers.37.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280766] Trainable param: llma.layers.37.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280776] Trainable param: llma.layers.37.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280787] Trainable param: llma.layers.37.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280797] Trainable param: llma.layers.37.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280808] Trainable param: llma.layers.37.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280818] Trainable param: llma.layers.37.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280830] Trainable param: llma.layers.38.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280839] Trainable param: llma.layers.38.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280849] Trainable param: llma.layers.38.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280858] Trainable param: llma.layers.38.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280868] Trainable param: llma.layers.38.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280877] Trainable param: llma.layers.38.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280888] Trainable param: llma.layers.38.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280897] Trainable param: llma.layers.38.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280908] Trainable param: llma.layers.38.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280917] Trainable param: llma.layers.38.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280932] Trainable param: llma.layers.38.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280942] Trainable param: llma.layers.38.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280952] Trainable param: llma.layers.38.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280961] Trainable param: llma.layers.38.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280971] Trainable param: llma.layers.38.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280981] Trainable param: llma.layers.38.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280993] Trainable param: llma.layers.39.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281002] Trainable param: llma.layers.39.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281013] Trainable param: llma.layers.39.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281022] Trainable param: llma.layers.39.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281032] Trainable param: llma.layers.39.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281041] Trainable param: llma.layers.39.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281051] Trainable param: llma.layers.39.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281060] Trainable param: llma.layers.39.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281072] Trainable param: llma.layers.39.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281081] Trainable param: llma.layers.39.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281093] Trainable param: llma.layers.39.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281102] Trainable param: llma.layers.39.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281112] Trainable param: llma.layers.39.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281121] Trainable param: llma.layers.39.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281131] Trainable param: llma.layers.39.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281141] Trainable param: llma.layers.39.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281153] Trainable param: llma.layers.40.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281162] Trainable param: llma.layers.40.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281172] Trainable param: llma.layers.40.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281181] Trainable param: llma.layers.40.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281192] Trainable param: llma.layers.40.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281201] Trainable param: llma.layers.40.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281211] Trainable param: llma.layers.40.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281220] Trainable param: llma.layers.40.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281231] Trainable param: llma.layers.40.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281240] Trainable param: llma.layers.40.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281250] Trainable param: llma.layers.40.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281259] Trainable param: llma.layers.40.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281270] Trainable param: llma.layers.40.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281279] Trainable param: llma.layers.40.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281289] Trainable param: llma.layers.40.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281299] Trainable param: llma.layers.40.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281311] Trainable param: llma.layers.41.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281320] Trainable param: llma.layers.41.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281330] Trainable param: llma.layers.41.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281339] Trainable param: llma.layers.41.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281349] Trainable param: llma.layers.41.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281359] Trainable param: llma.layers.41.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281370] Trainable param: llma.layers.41.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281379] Trainable param: llma.layers.41.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281390] Trainable param: llma.layers.41.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281399] Trainable param: llma.layers.41.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281409] Trainable param: llma.layers.41.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281418] Trainable param: llma.layers.41.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281428] Trainable param: llma.layers.41.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281437] Trainable param: llma.layers.41.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281448] Trainable param: llma.layers.41.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281458] Trainable param: llma.layers.41.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281470] Trainable param: llma.layers.42.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281479] Trainable param: llma.layers.42.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281489] Trainable param: llma.layers.42.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281498] Trainable param: llma.layers.42.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281508] Trainable param: llma.layers.42.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281521] Trainable param: llma.layers.42.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281531] Trainable param: llma.layers.42.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281540] Trainable param: llma.layers.42.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281551] Trainable param: llma.layers.42.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True,
dtype: torch.bfloat16 +[19:27:29.281560] Trainable param: llma.layers.42.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281571] Trainable param: llma.layers.42.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281580] Trainable param: llma.layers.42.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281590] Trainable param: llma.layers.42.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281599] Trainable param: llma.layers.42.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281609] Trainable param: llma.layers.42.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281620] Trainable param: llma.layers.42.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281632] Trainable param: llma.layers.43.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281641] Trainable param: llma.layers.43.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281652] Trainable param: llma.layers.43.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281661] Trainable param: llma.layers.43.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281671] Trainable param: llma.layers.43.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281680] Trainable param: llma.layers.43.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281690] Trainable param: llma.layers.43.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281699] Trainable param: llma.layers.43.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281710] Trainable param: llma.layers.43.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281719] Trainable param: llma.layers.43.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281730] Trainable param: llma.layers.43.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281739] Trainable param: llma.layers.43.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281749] Trainable param: llma.layers.43.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281758] Trainable param: llma.layers.43.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281769] Trainable param: llma.layers.43.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281779] Trainable param: llma.layers.43.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281791] Trainable param: 
llma.layers.44.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281800] Trainable param: llma.layers.44.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281810] Trainable param: llma.layers.44.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281819] Trainable param: llma.layers.44.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281829] Trainable param: llma.layers.44.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281838] Trainable param: llma.layers.44.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281848] Trainable param: llma.layers.44.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281857] Trainable param: llma.layers.44.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281869] Trainable param: llma.layers.44.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281878] Trainable param: llma.layers.44.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281888] Trainable param: llma.layers.44.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281897] Trainable param: llma.layers.44.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281908] Trainable param: llma.layers.44.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281916] Trainable param: llma.layers.44.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281927] Trainable param: llma.layers.44.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281936] Trainable param: llma.layers.44.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281948] Trainable param: llma.layers.45.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281957] Trainable param: llma.layers.45.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281967] Trainable param: llma.layers.45.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281976] Trainable param: llma.layers.45.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281987] Trainable param: llma.layers.45.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281996] Trainable param: llma.layers.45.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282006] Trainable param: llma.layers.45.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282015] Trainable param: llma.layers.45.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, 
dtype: torch.float32 +[19:27:29.282026] Trainable param: llma.layers.45.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282035] Trainable param: llma.layers.45.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282045] Trainable param: llma.layers.45.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282054] Trainable param: llma.layers.45.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282065] Trainable param: llma.layers.45.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282073] Trainable param: llma.layers.45.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282084] Trainable param: llma.layers.45.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282094] Trainable param: llma.layers.45.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282106] Trainable param: llma.layers.46.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282115] Trainable param: llma.layers.46.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282125] Trainable param: llma.layers.46.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282134] Trainable param: llma.layers.46.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282144] Trainable param: llma.layers.46.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282153] Trainable param: llma.layers.46.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282164] Trainable param: llma.layers.46.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282173] Trainable param: llma.layers.46.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282184] Trainable param: llma.layers.46.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282193] Trainable param: llma.layers.46.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282203] Trainable param: llma.layers.46.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282212] Trainable param: llma.layers.46.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282223] Trainable param: llma.layers.46.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282232] Trainable param: llma.layers.46.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282242] Trainable param: llma.layers.46.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282253] Trainable param: 
llma.layers.46.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282272] Trainable param: llma.layers.47.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282286] Trainable param: llma.layers.47.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282302] Trainable param: llma.layers.47.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282316] Trainable param: llma.layers.47.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282333] Trainable param: llma.layers.47.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282347] Trainable param: llma.layers.47.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282363] Trainable param: llma.layers.47.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282378] Trainable param: llma.layers.47.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282396] Trainable param: llma.layers.47.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282411] Trainable param: llma.layers.47.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282428] Trainable param: llma.layers.47.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282443] Trainable param: llma.layers.47.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282460] Trainable param: llma.layers.47.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282475] Trainable param: llma.layers.47.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282492] Trainable param: llma.layers.47.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282509] Trainable param: llma.layers.47.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282530] Trainable param: llma.layers.48.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282546] Trainable param: llma.layers.48.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282564] Trainable param: llma.layers.48.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282579] Trainable param: llma.layers.48.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282597] Trainable param: llma.layers.48.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282613] Trainable param: llma.layers.48.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282630] Trainable param: llma.layers.48.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[19:27:29.282645] Trainable param: llma.layers.48.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282665] Trainable param: llma.layers.48.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282681] Trainable param: llma.layers.48.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282699] Trainable param: llma.layers.48.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282714] Trainable param: llma.layers.48.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282732] Trainable param: llma.layers.48.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282747] Trainable param: llma.layers.48.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282766] Trainable param: llma.layers.48.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282783] Trainable param: llma.layers.48.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282805] Trainable param: llma.layers.49.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282821] Trainable param: llma.layers.49.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282839] Trainable param: llma.layers.49.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282854] Trainable param: llma.layers.49.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282871] Trainable param: llma.layers.49.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282885] Trainable param: llma.layers.49.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282901] Trainable param: llma.layers.49.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282916] Trainable param: llma.layers.49.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282934] Trainable param: llma.layers.49.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282949] Trainable param: llma.layers.49.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282966] Trainable param: llma.layers.49.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282980] Trainable param: llma.layers.49.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282997] Trainable param: llma.layers.49.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283012] Trainable param: llma.layers.49.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283028] Trainable param: 
llma.layers.49.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283043] Trainable param: llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283063] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283078] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283093] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283107] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283123] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283138] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283155] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283170] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283188] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283203] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283221] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283237] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283254] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283270] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283288] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283306] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283327] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283343] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283362] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283377] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283395] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283410] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[19:27:29.283428] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283444] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283464] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283479] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283498] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283514] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283532] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283548] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283566] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283583] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283605] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283621] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283639] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283654] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283673] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283688] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283704] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283720] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283740] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283756] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283773] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283789] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283807] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283823] Trainable param: 
llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283841] Trainable param: llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283859] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283881] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283897] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283915] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283931] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283949] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283964] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283981] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283996] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284014] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284029] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284047] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284062] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284079] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284094] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284111] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284128] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284153] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284169] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284188] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284204] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284223] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[19:27:29.284240] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284258] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284274] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284294] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284310] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284328] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284344] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284362] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284378] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284396] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284413] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284434] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284449] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284467] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284482] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284501] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284518] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284537] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284555] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284575] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284592] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284611] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284628] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284647] Trainable param: 
llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284663] Trainable param: llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284681] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284701] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284725] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284742] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284762] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284779] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284797] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284813] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284832] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284847] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284868] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284884] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284903] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284920] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284939] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284955] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284974] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284992] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285015] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285032] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285050] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285067] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[19:27:29.285084] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285100] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285117] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285132] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285151] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285166] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285184] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285199] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285216] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285231] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285247] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285263] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285282] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285297] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285314] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285328] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285345] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285359] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285375] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285389] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285407] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285421] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285438] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285452] Trainable param: 
llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285469] Trainable param: llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285483] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285499] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285523] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285544] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285558] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285574] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285589] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285607] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285623] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285641] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285657] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285677] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285693] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285711] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285727] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285745] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285761] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285777] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285794] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285815] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285830] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285847] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[19:27:29.285862] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285878] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285893] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285910] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285925] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285943] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285957] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285974] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285989] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286005] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286019] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286036] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286053] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286073] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286087] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286104] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286119] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286135] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286149] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286166] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286181] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286200] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286215] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286232] Trainable param: 
llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286247] Trainable param: llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286264] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286280] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286297] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286314] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286335] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286350] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286368] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286383] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286401] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286417] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286436] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286451] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286470] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286486] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286503] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286519] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286536] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286550] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286567] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286584] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286605] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286620] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.286638] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286653] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286671] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286686] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286704] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286720] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286739] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286753] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286769] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286783] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286801] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286815] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286833] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286851] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286874] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286889] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286907] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286922] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286939] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286954] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286971] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286985] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287004] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287019] Trainable param: 
llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287036] Trainable param: llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287051] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287067] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287082] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287099] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287115] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287136] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287151] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287168] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287183] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287199] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287214] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287230] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287245] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287263] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287277] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287293] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287308] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287324] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287338] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287354] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287369] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287389] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287404] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287421] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287435] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287452] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287467] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287484] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287499] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287518] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287532] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287549] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287564] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287581] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287596] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287613] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287629] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287649] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287663] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287680] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287695] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287711] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287725] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287742] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287757] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287775] Trainable param: 
llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287788] Trainable param: llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287804] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287819] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287838] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287854] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287873] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287892] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287914] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287931] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287950] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287965] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287984] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288000] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288018] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288033] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288053] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288069] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288088] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288104] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288122] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288138] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288156] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288173] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[19:27:29.288194] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288210] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288226] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288241] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288258] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288272] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288289] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288303] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288322] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288337] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288355] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288369] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288387] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288402] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288419] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288436] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288457] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288472] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288489] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288504] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288521] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288535] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288551] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288565] Trainable param: 
llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288583] Trainable param: llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288598] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288615] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288630] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288649] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288664] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288681] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288699] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288719] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288735] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288752] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288767] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288785] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288800] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288818] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288834] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288853] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288868] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288886] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288901] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288919] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288934] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288951] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[19:27:29.288967] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288989] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289005] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289023] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289038] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289056] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289071] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289088] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289104] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289124] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289139] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289157] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289172] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289190] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289205] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289222] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289238] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289259] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289274] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289292] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289307] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289324] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289339] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289356] Trainable param: 
llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289370] Trainable param: llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289389] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289404] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289421] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289436] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289452] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289467] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289483] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289499] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289525] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289541] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289559] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289573] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289590] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289604] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289621] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289635] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289653] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289667] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289684] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289699] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289715] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289729] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.289745] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289761] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289782] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289797] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289815] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289830] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289847] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289862] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289878] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289892] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289909] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289924] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289942] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289957] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289975] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289991] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290009] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290026] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290048] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290063] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290081] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290097] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290115] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290130] Trainable param: 
llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290148] Trainable param: llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290163] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290183] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290199] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290217] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290255] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290273] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290287] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290301] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290311] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290324] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290334] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290345] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290354] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290364] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290373] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290384] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290393] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290405] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290414] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290425] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290434] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290444] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290453] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290463] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290473] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290485] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290495] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290506] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290515] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290525] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290534] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290545] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290554] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290565] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290574] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290584] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290593] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290603] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290612] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290623] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290633] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290645] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290654] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290665] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290675] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290686] Trainable param: 
llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290695] Trainable param: llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290705] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290714] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290725] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290734] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290745] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290754] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290764] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290773] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290783] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290793] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290804] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290815] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290840] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[19:27:29.290845] Quantizing model to 4bit! 
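A consistent pattern runs through the parameter inventory above: every weight matrix is bfloat16 and flagged `model_parallel: True`, while biases and the RMSNorm weights are float32, with the attention output and `w2` biases plus the norm weights replicated (`model_parallel: False`). The sizes match Llama-2-70B (hidden width 8192, FFN width 28672, and 1024-wide K/V projections from grouped-query attention). As a rough sketch only (not the repository's actual logging code), a listing like this can be produced by walking `named_parameters()`; the `is_model_parallel` attribute below is a hypothetical stand-in for whatever sharding metadata the tensor-parallel library attaches to its parameters:

```python
import torch

def log_trainable_params(model: torch.nn.Module, log=print) -> None:
    """Print one line per trainable parameter, mimicking the log format above."""
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen parameters are skipped; only trainables are listed
        log(f"Trainable param: {name}, local_size: {param.shape}, "
            f"model_parallel: {getattr(param, 'is_model_parallel', False)}, "
            f"dtype: {param.dtype}")
```

The final two messages then load the Llama-2-70b base weights and announce "Quantizing model to 4bit!", i.e. the bfloat16 weight matrices are converted to 4-bit storage while the float32 biases and norms remain in full precision; that compression is what makes the peak memory reported in the step logs below (max mem: 71357, presumably megabytes, so roughly 71 GB per GPU) plausible for a 70B-parameter model.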
+[19:28:10.394107] Epoch: [0] [1340/6500] lr: 0.000010 closs: 0.8034 (0.8657) grad_norm: 0.5445 (0.8165) time: 5.5771 data: 0.0002 max mem: 71357 +[19:29:06.239565] Epoch: [0] [1350/6500] lr: 0.000010 closs: 0.8034 (0.8652) grad_norm: 0.5277 (0.8142) time: 5.5869 data: 0.0002 max mem: 71357 +[19:30:01.961975] Epoch: [0] [1360/6500] lr: 0.000010 closs: 0.7740 (0.8646) grad_norm: 0.5703 (0.8128) time: 5.5783 data: 0.0002 max mem: 71357 +[19:30:57.763309] Epoch: [0] [1370/6500] lr: 0.000011 closs: 0.7126 (0.8635) grad_norm: 0.5703 (0.8116) time: 5.5761 data: 0.0002 max mem: 71357 +[19:31:53.564694] Epoch: [0] [1380/6500] lr: 0.000011 closs: 0.7537 (0.8634) grad_norm: 0.5703 (0.8103) time: 5.5800 data: 0.0001 max mem: 71357 +[19:32:49.286247] Epoch: [0] [1390/6500] lr: 0.000011 closs: 0.7631 (0.8624) grad_norm: 0.6007 (0.8092) time: 5.5761 data: 0.0001 max mem: 71357 +[19:33:45.056994] Epoch: [0] [1400/6500] lr: 0.000011 closs: 0.8355 (0.8627) grad_norm: 0.5855 (0.8086) time: 5.5745 data: 0.0001 max mem: 71357 +[19:34:40.796559] Epoch: [0] [1410/6500] lr: 0.000011 closs: 0.8545 (0.8626) grad_norm: 0.5738 (0.8066) time: 5.5754 data: 0.0002 max mem: 71357 +[19:35:36.566183] Epoch: [0] [1420/6500] lr: 0.000011 closs: 0.7831 (0.8623) grad_norm: 0.4588 (0.8040) time: 5.5753 data: 0.0002 max mem: 71357 +[19:36:32.355594] Epoch: [0] [1430/6500] lr: 0.000011 closs: 0.8227 (0.8622) grad_norm: 0.5588 (0.8042) time: 5.5778 data: 0.0001 max mem: 71357 +[19:37:28.095893] Epoch: [0] [1440/6500] lr: 0.000011 closs: 0.8106 (0.8617) grad_norm: 0.5302 (0.8031) time: 5.5764 data: 0.0001 max mem: 71357 +[19:38:23.857545] Epoch: [0] [1450/6500] lr: 0.000011 closs: 0.7735 (0.8613) grad_norm: 0.5302 (0.8011) time: 5.5750 data: 0.0001 max mem: 71357 +[19:39:19.647799] Epoch: [0] [1460/6500] lr: 0.000011 closs: 0.8252 (0.8612) grad_norm: 0.5502 (0.8047) time: 5.5775 data: 0.0001 max mem: 71357 +[19:40:15.482957] Epoch: [0] [1470/6500] lr: 0.000011 closs: 0.8374 (0.8608) grad_norm: 0.5301 (0.8027) time: 5.5811 data: 0.0002 max mem: 71357 +[19:41:11.141116] Epoch: [0] [1480/6500] lr: 0.000011 closs: 0.8001 (0.8606) grad_norm: 0.5333 (0.8027) time: 5.5745 data: 0.0002 max mem: 71357 +[19:42:06.984066] Epoch: [0] [1490/6500] lr: 0.000011 closs: 0.7398 (0.8594) grad_norm: 0.5333 (0.8013) time: 5.5749 data: 0.0001 max mem: 71357 +[19:43:02.746693] Epoch: [0] [1500/6500] lr: 0.000012 closs: 0.7381 (0.8590) grad_norm: 0.5309 (0.8012) time: 5.5802 data: 0.0002 max mem: 71357 +[19:43:58.570975] Epoch: [0] [1510/6500] lr: 0.000012 closs: 0.8013 (0.8588) grad_norm: 0.4857 (0.7988) time: 5.5793 data: 0.0001 max mem: 71357 +[19:44:54.319260] Epoch: [0] [1520/6500] lr: 0.000012 closs: 0.8824 (0.8588) grad_norm: 0.5054 (0.7980) time: 5.5786 data: 0.0002 max mem: 71357 +[19:45:50.069233] Epoch: [0] [1530/6500] lr: 0.000012 closs: 0.8063 (0.8585) grad_norm: 0.4937 (0.7963) time: 5.5748 data: 0.0001 max mem: 71357 +[19:46:45.742778] Epoch: [0] [1540/6500] lr: 0.000012 closs: 0.7936 (0.8579) grad_norm: 0.5091 (0.7950) time: 5.5711 data: 0.0001 max mem: 71357 +[19:47:41.512558] Epoch: [0] [1550/6500] lr: 0.000012 closs: 0.7926 (0.8576) grad_norm: 0.5252 (0.7939) time: 5.5720 data: 0.0001 max mem: 71357 +[19:48:37.387361] Epoch: [0] [1560/6500] lr: 0.000012 closs: 0.7592 (0.8569) grad_norm: 0.4907 (0.7924) time: 5.5821 data: 0.0001 max mem: 71357 +[19:49:33.103590] Epoch: [0] [1570/6500] lr: 0.000012 closs: 0.7427 (0.8565) grad_norm: 0.5252 (0.7923) time: 5.5794 data: 0.0001 max mem: 71357 +[19:50:28.953920] Epoch: [0] [1580/6500] lr: 
0.000012 closs: 0.7489 (0.8559) grad_norm: 0.4674 (0.7904) time: 5.5783 data: 0.0001 max mem: 71357 +[19:51:24.796572] Epoch: [0] [1590/6500] lr: 0.000012 closs: 0.7778 (0.8552) grad_norm: 0.4821 (0.7886) time: 5.5845 data: 0.0002 max mem: 71357 +[19:52:20.573169] Epoch: [0] [1600/6500] lr: 0.000012 closs: 0.8162 (0.8553) grad_norm: 0.5151 (0.7882) time: 5.5808 data: 0.0002 max mem: 71357 +[19:53:16.418950] Epoch: [0] [1610/6500] lr: 0.000012 closs: 0.8162 (0.8549) grad_norm: 0.5045 (0.7865) time: 5.5810 data: 0.0001 max mem: 71357 +[19:54:12.129422] Epoch: [0] [1620/6500] lr: 0.000012 closs: 0.7935 (0.8548) grad_norm: 0.5045 (0.7844) time: 5.5777 data: 0.0001 max mem: 71357 +[19:55:07.940492] Epoch: [0] [1630/6500] lr: 0.000013 closs: 0.8191 (0.8547) grad_norm: 0.5045 (0.7841) time: 5.5760 data: 0.0001 max mem: 71357 +[19:56:03.688378] Epoch: [0] [1640/6500] lr: 0.000013 closs: 0.8281 (0.8546) grad_norm: 0.4828 (0.7837) time: 5.5779 data: 0.0001 max mem: 71357 +[19:56:59.568279] Epoch: [0] [1650/6500] lr: 0.000013 closs: 0.7958 (0.8542) grad_norm: 0.4776 (0.7821) time: 5.5813 data: 0.0001 max mem: 71357 +[19:57:55.339672] Epoch: [0] [1660/6500] lr: 0.000013 closs: 0.7472 (0.8538) grad_norm: 0.6027 (0.7811) time: 5.5825 data: 0.0002 max mem: 71357 +[19:58:50.972681] Epoch: [0] [1670/6500] lr: 0.000013 closs: 0.7411 (0.8532) grad_norm: 0.5621 (0.7802) time: 5.5701 data: 0.0002 max mem: 71357 +[19:59:46.690981] Epoch: [0] [1680/6500] lr: 0.000013 closs: 0.7657 (0.8525) grad_norm: 0.5570 (0.7789) time: 5.5674 data: 0.0002 max mem: 71357 +[20:00:42.465398] Epoch: [0] [1690/6500] lr: 0.000013 closs: 0.6785 (0.8513) grad_norm: 0.5621 (0.7781) time: 5.5745 data: 0.0002 max mem: 71357 +[20:01:38.307864] Epoch: [0] [1700/6500] lr: 0.000013 closs: 0.6685 (0.8507) grad_norm: 0.5087 (0.7765) time: 5.5808 data: 0.0001 max mem: 71357 +[20:02:34.069065] Epoch: [0] [1710/6500] lr: 0.000013 closs: 0.7515 (0.8502) grad_norm: 0.5087 (0.7756) time: 5.5801 data: 0.0001 max mem: 71357 +[20:03:29.815804] Epoch: [0] [1720/6500] lr: 0.000013 closs: 0.7740 (0.8501) grad_norm: 0.5406 (0.7768) time: 5.5753 data: 0.0001 max mem: 71357 +[20:04:25.658015] Epoch: [0] [1730/6500] lr: 0.000013 closs: 0.7883 (0.8499) grad_norm: 0.5054 (0.7752) time: 5.5793 data: 0.0001 max mem: 71357 +[20:05:21.448684] Epoch: [0] [1740/6500] lr: 0.000013 closs: 0.7883 (0.8497) grad_norm: 0.5406 (0.7747) time: 5.5815 data: 0.0002 max mem: 71357 +[20:06:17.132635] Epoch: [0] [1750/6500] lr: 0.000013 closs: 0.7562 (0.8491) grad_norm: 0.5160 (0.7736) time: 5.5737 data: 0.0002 max mem: 71357 +[20:07:12.915410] Epoch: [0] [1760/6500] lr: 0.000014 closs: 0.8120 (0.8491) grad_norm: 0.5160 (0.7725) time: 5.5733 data: 0.0001 max mem: 71357 +[20:08:08.564010] Epoch: [0] [1770/6500] lr: 0.000014 closs: 0.9002 (0.8495) grad_norm: 0.6313 (0.7726) time: 5.5715 data: 0.0001 max mem: 71357 +[20:09:04.351531] Epoch: [0] [1780/6500] lr: 0.000014 closs: 0.9002 (0.8491) grad_norm: 0.5656 (0.7711) time: 5.5717 data: 0.0001 max mem: 71357 +[20:10:00.035219] Epoch: [0] [1790/6500] lr: 0.000014 closs: 0.7180 (0.8488) grad_norm: 0.6461 (0.7707) time: 5.5734 data: 0.0002 max mem: 71357 +[20:10:55.870427] Epoch: [0] [1800/6500] lr: 0.000014 closs: 0.7484 (0.8481) grad_norm: 0.5875 (0.7696) time: 5.5758 data: 0.0002 max mem: 71357 +[20:11:51.589061] Epoch: [0] [1810/6500] lr: 0.000014 closs: 0.7187 (0.8474) grad_norm: 0.5284 (0.7691) time: 5.5776 data: 0.0001 max mem: 71357 +[20:12:47.419446] Epoch: [0] [1820/6500] lr: 0.000014 closs: 0.7622 (0.8475) grad_norm: 0.5546 
(0.7679) time: 5.5774 data: 0.0002 max mem: 71357 +[20:13:43.152888] Epoch: [0] [1830/6500] lr: 0.000014 closs: 0.8333 (0.8472) grad_norm: 0.5013 (0.7662) time: 5.5781 data: 0.0002 max mem: 71357 +[20:14:38.893836] Epoch: [0] [1840/6500] lr: 0.000014 closs: 0.8029 (0.8472) grad_norm: 0.5110 (0.7648) time: 5.5736 data: 0.0002 max mem: 71357 +[20:15:34.630747] Epoch: [0] [1850/6500] lr: 0.000014 closs: 0.8166 (0.8470) grad_norm: 0.4765 (0.7632) time: 5.5738 data: 0.0001 max mem: 71357 +[20:16:30.314557] Epoch: [0] [1860/6500] lr: 0.000014 closs: 0.8166 (0.8469) grad_norm: 0.4707 (0.7628) time: 5.5709 data: 0.0001 max mem: 71357 +[20:17:26.194852] Epoch: [0] [1870/6500] lr: 0.000014 closs: 0.8289 (0.8467) grad_norm: 0.4765 (0.7619) time: 5.5781 data: 0.0001 max mem: 71357 +[20:18:21.871034] Epoch: [0] [1880/6500] lr: 0.000014 closs: 0.7967 (0.8464) grad_norm: 0.5001 (0.7617) time: 5.5778 data: 0.0001 max mem: 71357 +[20:19:17.591186] Epoch: [0] [1890/6500] lr: 0.000015 closs: 0.7326 (0.8458) grad_norm: 0.6081 (0.7608) time: 5.5697 data: 0.0001 max mem: 71357 +[20:20:13.289294] Epoch: [0] [1900/6500] lr: 0.000015 closs: 0.7281 (0.8450) grad_norm: 0.5038 (0.7597) time: 5.5708 data: 0.0002 max mem: 71357 +[20:21:09.177977] Epoch: [0] [1910/6500] lr: 0.000015 closs: 0.7328 (0.8453) grad_norm: 0.5284 (0.7604) time: 5.5792 data: 0.0002 max mem: 71357 +[20:22:04.997084] Epoch: [0] [1920/6500] lr: 0.000015 closs: 0.8228 (0.8450) grad_norm: 0.5284 (0.7592) time: 5.5853 data: 0.0001 max mem: 71357 +[20:23:00.755724] Epoch: [0] [1930/6500] lr: 0.000015 closs: 0.8124 (0.8447) grad_norm: 0.5513 (0.7585) time: 5.5788 data: 0.0001 max mem: 71357 +[20:23:56.446255] Epoch: [0] [1940/6500] lr: 0.000015 closs: 0.8124 (0.8448) grad_norm: 0.5656 (0.7573) time: 5.5723 data: 0.0001 max mem: 71357 +[20:24:52.260103] Epoch: [0] [1950/6500] lr: 0.000015 closs: 0.7912 (0.8445) grad_norm: 0.5052 (0.7558) time: 5.5751 data: 0.0001 max mem: 71357 +[20:25:47.913210] Epoch: [0] [1960/6500] lr: 0.000015 closs: 0.7160 (0.8440) grad_norm: 0.5052 (0.7555) time: 5.5733 data: 0.0002 max mem: 71357 +[20:26:43.623002] Epoch: [0] [1970/6500] lr: 0.000015 closs: 0.7569 (0.8438) grad_norm: 0.5321 (0.7545) time: 5.5681 data: 0.0002 max mem: 71357 +[20:27:39.403420] Epoch: [0] [1980/6500] lr: 0.000015 closs: 0.8105 (0.8436) grad_norm: 0.5093 (0.7539) time: 5.5744 data: 0.0002 max mem: 71357 +[20:28:35.232297] Epoch: [0] [1990/6500] lr: 0.000015 closs: 0.7956 (0.8437) grad_norm: 0.5200 (0.7528) time: 5.5804 data: 0.0001 max mem: 71357 +[20:29:30.942620] Epoch: [0] [2000/6500] lr: 0.000015 closs: 0.7954 (0.8438) grad_norm: 0.5200 (0.7519) time: 5.5769 data: 0.0001 max mem: 71357 +[20:30:26.631297] Epoch: [0] [2010/6500] lr: 0.000015 closs: 0.7865 (0.8435) grad_norm: 0.4947 (0.7508) time: 5.5698 data: 0.0001 max mem: 71357 +[20:31:22.375631] Epoch: [0] [2020/6500] lr: 0.000016 closs: 0.7190 (0.8429) grad_norm: 0.4947 (0.7500) time: 5.5715 data: 0.0002 max mem: 71357 +[20:32:18.160803] Epoch: [0] [2030/6500] lr: 0.000016 closs: 0.6794 (0.8426) grad_norm: 0.5117 (0.7508) time: 5.5763 data: 0.0002 max mem: 71357 +[20:33:14.032422] Epoch: [0] [2040/6500] lr: 0.000016 closs: 0.6984 (0.8418) grad_norm: 0.5095 (0.7495) time: 5.5827 data: 0.0001 max mem: 71357 +[20:34:09.674229] Epoch: [0] [2050/6500] lr: 0.000016 closs: 0.7023 (0.8413) grad_norm: 0.5421 (0.7498) time: 5.5756 data: 0.0001 max mem: 71357 +[20:35:05.378186] Epoch: [0] [2060/6500] lr: 0.000016 closs: 0.7354 (0.8410) grad_norm: 0.6405 (0.7500) time: 5.5672 data: 0.0002 max mem: 71357 
+[20:36:01.171645] Epoch: [0] [2070/6500] lr: 0.000016 closs: 0.7757 (0.8410) grad_norm: 0.5421 (0.7489) time: 5.5747 data: 0.0002 max mem: 71357 +[20:36:57.047870] Epoch: [0] [2080/6500] lr: 0.000016 closs: 0.8422 (0.8410) grad_norm: 0.5457 (0.7477) time: 5.5834 data: 0.0002 max mem: 71357 +[20:37:52.852566] Epoch: [0] [2090/6500] lr: 0.000016 closs: 0.7849 (0.8408) grad_norm: 0.5457 (0.7474) time: 5.5840 data: 0.0001 max mem: 71357 +[20:38:48.604842] Epoch: [0] [2100/6500] lr: 0.000016 closs: 0.7323 (0.8402) grad_norm: 0.4892 (0.7468) time: 5.5778 data: 0.0001 max mem: 71357 +[20:39:44.382964] Epoch: [0] [2110/6500] lr: 0.000016 closs: 0.7806 (0.8405) grad_norm: 0.4828 (0.7459) time: 5.5764 data: 0.0002 max mem: 71357 +[20:40:40.183395] Epoch: [0] [2120/6500] lr: 0.000016 closs: 0.8687 (0.8406) grad_norm: 0.4828 (0.7446) time: 5.5788 data: 0.0001 max mem: 71357 +[20:41:35.950265] Epoch: [0] [2130/6500] lr: 0.000016 closs: 0.7655 (0.8401) grad_norm: 0.4828 (0.7435) time: 5.5782 data: 0.0001 max mem: 71357 +[20:42:31.699836] Epoch: [0] [2140/6500] lr: 0.000016 closs: 0.7670 (0.8399) grad_norm: 0.4762 (0.7425) time: 5.5757 data: 0.0001 max mem: 71357 +[20:43:27.317712] Epoch: [0] [2150/6500] lr: 0.000017 closs: 0.7810 (0.8397) grad_norm: 0.5027 (0.7424) time: 5.5683 data: 0.0001 max mem: 71357 +[20:44:23.007754] Epoch: [0] [2160/6500] lr: 0.000017 closs: 0.7664 (0.8397) grad_norm: 0.5038 (0.7418) time: 5.5653 data: 0.0001 max mem: 71357 +[20:45:18.802104] Epoch: [0] [2170/6500] lr: 0.000017 closs: 0.7484 (0.8394) grad_norm: 0.5949 (0.7415) time: 5.5741 data: 0.0002 max mem: 71357 +[20:46:14.517863] Epoch: [0] [2180/6500] lr: 0.000017 closs: 0.7521 (0.8392) grad_norm: 0.5192 (0.7403) time: 5.5754 data: 0.0002 max mem: 71357 +[20:47:10.200594] Epoch: [0] [2190/6500] lr: 0.000017 closs: 0.7133 (0.8389) grad_norm: 0.5528 (0.7399) time: 5.5698 data: 0.0001 max mem: 71357 +[20:48:05.878498] Epoch: [0] [2200/6500] lr: 0.000017 closs: 0.7301 (0.8386) grad_norm: 0.5528 (0.7390) time: 5.5679 data: 0.0002 max mem: 71357 +[20:49:01.610047] Epoch: [0] [2210/6500] lr: 0.000017 closs: 0.7366 (0.8381) grad_norm: 0.5171 (0.7379) time: 5.5704 data: 0.0001 max mem: 71357 +[20:49:57.481127] Epoch: [0] [2220/6500] lr: 0.000017 closs: 0.7101 (0.8375) grad_norm: 0.5008 (0.7367) time: 5.5800 data: 0.0002 max mem: 71357 +[20:50:53.173124] Epoch: [0] [2230/6500] lr: 0.000017 closs: 0.7101 (0.8372) grad_norm: 0.4889 (0.7370) time: 5.5780 data: 0.0002 max mem: 71357 +[20:51:48.892801] Epoch: [0] [2240/6500] lr: 0.000017 closs: 0.7136 (0.8366) grad_norm: 0.4889 (0.7363) time: 5.5704 data: 0.0002 max mem: 71357 +[20:52:44.665693] Epoch: [0] [2250/6500] lr: 0.000017 closs: 0.7372 (0.8365) grad_norm: 0.5008 (0.7355) time: 5.5745 data: 0.0002 max mem: 71357 +[20:53:40.495845] Epoch: [0] [2260/6500] lr: 0.000017 closs: 0.7952 (0.8366) grad_norm: 0.5428 (0.7346) time: 5.5800 data: 0.0002 max mem: 71357 +[20:54:36.203059] Epoch: [0] [2270/6500] lr: 0.000017 closs: 0.7883 (0.8365) grad_norm: 0.5428 (0.7345) time: 5.5768 data: 0.0002 max mem: 71357 +[20:55:31.918928] Epoch: [0] [2280/6500] lr: 0.000018 closs: 0.7588 (0.8359) grad_norm: 0.4861 (0.7333) time: 5.5711 data: 0.0002 max mem: 71357 +[20:56:27.730546] Epoch: [0] [2290/6500] lr: 0.000018 closs: 0.7458 (0.8357) grad_norm: 0.4716 (0.7331) time: 5.5763 data: 0.0002 max mem: 71357 +[20:57:23.453980] Epoch: [0] [2300/6500] lr: 0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7329) time: 5.5767 data: 0.0002 max mem: 71357 +[20:58:19.330645] Epoch: [0] [2310/6500] lr: 
0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7318) time: 5.5799 data: 0.0001 max mem: 71357 +[20:59:15.049870] Epoch: [0] [2320/6500] lr: 0.000018 closs: 0.7585 (0.8348) grad_norm: 0.5165 (0.7310) time: 5.5797 data: 0.0001 max mem: 71357 +[21:00:10.840941] Epoch: [0] [2330/6500] lr: 0.000018 closs: 0.7507 (0.8345) grad_norm: 0.5165 (0.7304) time: 5.5754 data: 0.0001 max mem: 71357 +[21:01:06.575722] Epoch: [0] [2340/6500] lr: 0.000018 closs: 0.7689 (0.8342) grad_norm: 0.4833 (0.7296) time: 5.5762 data: 0.0002 max mem: 71357 +[21:02:02.422716] Epoch: [0] [2350/6500] lr: 0.000018 closs: 0.7870 (0.8342) grad_norm: 0.4735 (0.7284) time: 5.5790 data: 0.0002 max mem: 71357 +[21:02:58.275189] Epoch: [0] [2360/6500] lr: 0.000018 closs: 0.8006 (0.8342) grad_norm: 0.4750 (0.7274) time: 5.5848 data: 0.0001 max mem: 71357 +[21:03:54.007925] Epoch: [0] [2370/6500] lr: 0.000018 closs: 0.7329 (0.8334) grad_norm: 0.4833 (0.7271) time: 5.5791 data: 0.0001 max mem: 71357 +[21:04:49.752785] Epoch: [0] [2380/6500] lr: 0.000018 closs: 0.7316 (0.8335) grad_norm: 0.4943 (0.7266) time: 5.5738 data: 0.0002 max mem: 71357 +[21:05:45.680885] Epoch: [0] [2390/6500] lr: 0.000018 closs: 0.8211 (0.8335) grad_norm: 0.5098 (0.7261) time: 5.5836 data: 0.0002 max mem: 71357 +[21:06:41.469027] Epoch: [0] [2400/6500] lr: 0.000018 closs: 0.8356 (0.8335) grad_norm: 0.5608 (0.7252) time: 5.5857 data: 0.0001 max mem: 71357 +[21:07:37.229547] Epoch: [0] [2410/6500] lr: 0.000019 closs: 0.7179 (0.8329) grad_norm: 0.5037 (0.7249) time: 5.5773 data: 0.0002 max mem: 71357 +[21:08:32.958561] Epoch: [0] [2420/6500] lr: 0.000019 closs: 0.7157 (0.8327) grad_norm: 0.5037 (0.7241) time: 5.5744 data: 0.0002 max mem: 71357 +[21:09:28.673873] Epoch: [0] [2430/6500] lr: 0.000019 closs: 0.7412 (0.8324) grad_norm: 0.4534 (0.7229) time: 5.5721 data: 0.0002 max mem: 71357 +[21:10:24.504516] Epoch: [0] [2440/6500] lr: 0.000019 closs: 0.8024 (0.8324) grad_norm: 0.4475 (0.7218) time: 5.5772 data: 0.0002 max mem: 71357 +[21:11:20.170214] Epoch: [0] [2450/6500] lr: 0.000019 closs: 0.7852 (0.8321) grad_norm: 0.4525 (0.7210) time: 5.5747 data: 0.0002 max mem: 71357 +[21:12:15.876408] Epoch: [0] [2460/6500] lr: 0.000019 closs: 0.7852 (0.8318) grad_norm: 0.4266 (0.7201) time: 5.5685 data: 0.0002 max mem: 71357 +[21:13:11.617462] Epoch: [0] [2470/6500] lr: 0.000019 closs: 0.8011 (0.8319) grad_norm: 0.4346 (0.7192) time: 5.5723 data: 0.0002 max mem: 71357 +[21:14:07.477465] Epoch: [0] [2480/6500] lr: 0.000019 closs: 0.7495 (0.8318) grad_norm: 0.4759 (0.7184) time: 5.5799 data: 0.0001 max mem: 71357 +[21:15:03.160431] Epoch: [0] [2490/6500] lr: 0.000019 closs: 0.7495 (0.8316) grad_norm: 0.4835 (0.7178) time: 5.5770 data: 0.0002 max mem: 71357 +[21:15:58.872968] Epoch: [0] [2500/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4726 (0.7166) time: 5.5697 data: 0.0002 max mem: 71357 +[21:16:54.619091] Epoch: [0] [2510/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4883 (0.7158) time: 5.5728 data: 0.0001 max mem: 71357 +[21:17:50.323182] Epoch: [0] [2520/6500] lr: 0.000019 closs: 0.8567 (0.8313) grad_norm: 0.5139 (0.7170) time: 5.5724 data: 0.0001 max mem: 71357 +[21:18:46.181308] Epoch: [0] [2530/6500] lr: 0.000019 closs: 0.8298 (0.8314) grad_norm: 0.5134 (0.7165) time: 5.5780 data: 0.0001 max mem: 71357 +[21:19:41.968783] Epoch: [0] [2540/6500] lr: 0.000020 closs: 0.8169 (0.8311) grad_norm: 0.5156 (0.7158) time: 5.5821 data: 0.0001 max mem: 71357 +[21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 closs: 0.8056 (0.8310) grad_norm: 0.5134 
(0.7148) time: 5.5830 data: 0.0002 max mem: 71357 +[21:21:33.609151] Epoch: [0] [2560/6500] lr: 0.000020 closs: 0.7994 (0.8310) grad_norm: 0.5011 (0.7142) time: 5.5819 data: 0.0002 max mem: 71357 +[21:22:29.557411] Epoch: [0] [2570/6500] lr: 0.000020 closs: 0.7904 (0.8309) grad_norm: 0.5156 (0.7135) time: 5.5855 data: 0.0002 max mem: 71357 +[21:23:25.337398] Epoch: [0] [2580/6500] lr: 0.000020 closs: 0.7925 (0.8307) grad_norm: 0.5411 (0.7129) time: 5.5863 data: 0.0002 max mem: 71357 +[21:24:21.118584] Epoch: [0] [2590/6500] lr: 0.000020 closs: 0.7940 (0.8308) grad_norm: 0.5367 (0.7119) time: 5.5779 data: 0.0002 max mem: 71357 +[21:25:16.916564] Epoch: [0] [2600/6500] lr: 0.000020 closs: 0.7737 (0.8306) grad_norm: 0.5209 (0.7115) time: 5.5789 data: 0.0002 max mem: 71357 +[21:26:12.737112] Epoch: [0] [2610/6500] lr: 0.000020 closs: 0.7394 (0.8305) grad_norm: 0.5209 (0.7118) time: 5.5808 data: 0.0002 max mem: 71357 +[21:27:08.510919] Epoch: [0] [2620/6500] lr: 0.000020 closs: 0.7944 (0.8304) grad_norm: 0.4956 (0.7111) time: 5.5796 data: 0.0002 max mem: 71357 +[21:28:04.306907] Epoch: [0] [2630/6500] lr: 0.000020 closs: 0.7944 (0.8303) grad_norm: 0.4956 (0.7102) time: 5.5784 data: 0.0002 max mem: 71357 +[21:29:00.079713] Epoch: [0] [2640/6500] lr: 0.000020 closs: 0.7446 (0.8298) grad_norm: 0.4956 (0.7094) time: 5.5784 data: 0.0002 max mem: 71357 +[21:29:55.866717] Epoch: [0] [2650/6500] lr: 0.000020 closs: 0.7546 (0.8296) grad_norm: 0.4603 (0.7084) time: 5.5779 data: 0.0001 max mem: 71357 +[21:30:51.780482] Epoch: [0] [2660/6500] lr: 0.000020 closs: 0.8542 (0.8297) grad_norm: 0.4237 (0.7076) time: 5.5850 data: 0.0001 max mem: 71357 +[21:31:47.549029] Epoch: [0] [2670/6500] lr: 0.000021 closs: 0.7719 (0.8291) grad_norm: 0.4430 (0.7072) time: 5.5840 data: 0.0001 max mem: 71357 +[21:32:43.367174] Epoch: [0] [2680/6500] lr: 0.000021 closs: 0.7024 (0.8288) grad_norm: 0.4734 (0.7067) time: 5.5792 data: 0.0002 max mem: 71357 +[21:33:39.102598] Epoch: [0] [2690/6500] lr: 0.000021 closs: 0.7855 (0.8287) grad_norm: 0.5127 (0.7059) time: 5.5776 data: 0.0002 max mem: 71357 +[21:34:35.007085] Epoch: [0] [2700/6500] lr: 0.000021 closs: 0.7978 (0.8288) grad_norm: 0.5239 (0.7054) time: 5.5819 data: 0.0002 max mem: 71357 +[21:35:30.772658] Epoch: [0] [2710/6500] lr: 0.000021 closs: 0.7809 (0.8284) grad_norm: 0.5049 (0.7043) time: 5.5834 data: 0.0002 max mem: 71357 +[21:36:26.600799] Epoch: [0] [2720/6500] lr: 0.000021 closs: 0.7460 (0.8281) grad_norm: 0.5109 (0.7037) time: 5.5796 data: 0.0002 max mem: 71357 +[21:37:22.396532] Epoch: [0] [2730/6500] lr: 0.000021 closs: 0.6897 (0.8277) grad_norm: 0.4972 (0.7030) time: 5.5811 data: 0.0001 max mem: 71357 +[21:38:18.216414] Epoch: [0] [2740/6500] lr: 0.000021 closs: 0.7477 (0.8275) grad_norm: 0.4605 (0.7029) time: 5.5806 data: 0.0002 max mem: 71357 +[21:39:14.267187] Epoch: [0] [2750/6500] lr: 0.000021 closs: 0.7866 (0.8273) grad_norm: 0.4752 (0.7020) time: 5.5934 data: 0.0002 max mem: 71357 +[21:40:10.212618] Epoch: [0] [2760/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4605 (0.7010) time: 5.5996 data: 0.0002 max mem: 71357 +[21:41:05.958460] Epoch: [0] [2770/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4752 (0.7008) time: 5.5844 data: 0.0002 max mem: 71357 +[21:42:01.770293] Epoch: [0] [2780/6500] lr: 0.000021 closs: 0.8009 (0.8272) grad_norm: 0.4752 (0.7002) time: 5.5778 data: 0.0002 max mem: 71357 +[21:42:57.678502] Epoch: [0] [2790/6500] lr: 0.000021 closs: 0.8009 (0.8271) grad_norm: 0.4831 (0.7007) time: 5.5859 data: 0.0002 max mem: 71357 
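A note on reading the step lines around here: each entry reports `closs` and `grad_norm` as a recent smoothed value followed by a running average in parentheses (the usual MetricLogger convention), `time` and `data` as seconds per iteration and per data-load, and `max mem` as peak GPU memory. The learning rate climbs roughly linearly from 1.0e-5 near iteration 1340 to about 2.5e-5 by iteration 3200, so this whole range is still inside warmup. Below is a small parser sketch for these lines, assuming the exact format shown (field names and spacing are taken verbatim from the log; a real run would apply it to the reassembled log file, one entry per line):

```python
import re

# One metric entry, e.g.:
# [21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 closs: 0.8056 (0.8310)
#   grad_norm: 0.5134 (0.7148) time: 5.5830 data: 0.0002 max mem: 71357
ENTRY = re.compile(
    r"\[(?P<ts>\d{2}:\d{2}:\d{2}\.\d+)\]\s+Epoch:\s+\[(?P<epoch>\d+)\]\s+"
    r"\[(?P<step>\d+)/(?P<total>\d+)\]\s+lr:\s+(?P<lr>[\d.]+)\s+"
    r"closs:\s+(?P<closs>[\d.]+)\s+\((?P<closs_avg>[\d.]+)\)\s+"
    r"grad_norm:\s+(?P<gnorm>[\d.]+)\s+\((?P<gnorm_avg>[\d.]+)\)"
)

def parse_entry(line: str) -> dict | None:
    """Extract step index, lr, loss, and grad-norm stats from one log line."""
    m = ENTRY.search(line)
    if m is None:
        return None
    out = m.groupdict()
    for k in ("epoch", "step", "total"):
        out[k] = int(out[k])
    for k in ("lr", "closs", "closs_avg", "gnorm", "gnorm_avg"):
        out[k] = float(out[k])
    return out

# Example on an entry copied from the log above:
print(parse_entry(
    "[21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 "
    "closs: 0.8056 (0.8310) grad_norm: 0.5134 (0.7148) "
    "time: 5.5830 data: 0.0002 max mem: 71357"
))
```

Applied over the full log, this yields per-step series suitable for plotting the loss and gradient-norm trends visible in the raw entries.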
+[21:43:53.448984] Epoch: [0] [2800/6500] lr: 0.000022 closs: 0.8074 (0.8271) grad_norm: 0.5026 (0.7000) time: 5.5838 data: 0.0002 max mem: 71357 +[21:44:49.284849] Epoch: [0] [2810/6500] lr: 0.000022 closs: 0.8074 (0.8270) grad_norm: 0.4758 (0.6991) time: 5.5802 data: 0.0002 max mem: 71357 +[21:45:45.008739] Epoch: [0] [2820/6500] lr: 0.000022 closs: 0.7502 (0.8266) grad_norm: 0.4573 (0.6983) time: 5.5778 data: 0.0002 max mem: 71357 +[21:46:40.941467] Epoch: [0] [2830/6500] lr: 0.000022 closs: 0.7663 (0.8264) grad_norm: 0.4320 (0.6973) time: 5.5827 data: 0.0002 max mem: 71357 +[21:47:36.741840] Epoch: [0] [2840/6500] lr: 0.000022 closs: 0.8014 (0.8264) grad_norm: 0.4285 (0.6973) time: 5.5866 data: 0.0002 max mem: 71357 +[21:48:32.454452] Epoch: [0] [2850/6500] lr: 0.000022 closs: 0.8047 (0.8261) grad_norm: 0.4406 (0.6966) time: 5.5756 data: 0.0001 max mem: 71357 +[21:49:28.166471] Epoch: [0] [2860/6500] lr: 0.000022 closs: 0.6620 (0.8256) grad_norm: 0.4406 (0.6963) time: 5.5711 data: 0.0001 max mem: 71357 +[21:50:23.965017] Epoch: [0] [2870/6500] lr: 0.000022 closs: 0.6841 (0.8253) grad_norm: 0.5207 (0.6958) time: 5.5754 data: 0.0002 max mem: 71357 +[21:51:19.762370] Epoch: [0] [2880/6500] lr: 0.000022 closs: 0.7397 (0.8252) grad_norm: 0.5657 (0.6979) time: 5.5797 data: 0.0002 max mem: 71357 +[21:52:15.502191] Epoch: [0] [2890/6500] lr: 0.000022 closs: 0.8410 (0.8254) grad_norm: 0.5332 (0.6971) time: 5.5767 data: 0.0001 max mem: 71357 +[21:53:11.116475] Epoch: [0] [2900/6500] lr: 0.000022 closs: 0.7739 (0.8251) grad_norm: 0.5285 (0.6964) time: 5.5676 data: 0.0001 max mem: 71357 +[21:54:06.797736] Epoch: [0] [2910/6500] lr: 0.000022 closs: 0.7956 (0.8252) grad_norm: 0.5048 (0.6961) time: 5.5647 data: 0.0001 max mem: 71357 +[21:55:02.572561] Epoch: [0] [2920/6500] lr: 0.000022 closs: 0.8117 (0.8251) grad_norm: 0.4933 (0.6954) time: 5.5727 data: 0.0001 max mem: 71357 +[21:55:58.265640] Epoch: [0] [2930/6500] lr: 0.000023 closs: 0.7544 (0.8247) grad_norm: 0.4933 (0.6957) time: 5.5732 data: 0.0002 max mem: 71357 +[21:56:53.952100] Epoch: [0] [2940/6500] lr: 0.000023 closs: 0.8150 (0.8249) grad_norm: 0.5311 (0.6952) time: 5.5689 data: 0.0002 max mem: 71357 +[21:57:49.722026] Epoch: [0] [2950/6500] lr: 0.000023 closs: 0.8308 (0.8248) grad_norm: 0.5311 (0.6948) time: 5.5727 data: 0.0001 max mem: 71357 +[21:58:45.409923] Epoch: [0] [2960/6500] lr: 0.000023 closs: 0.8063 (0.8248) grad_norm: 0.4842 (0.6945) time: 5.5728 data: 0.0002 max mem: 71357 +[21:59:41.250401] Epoch: [0] [2970/6500] lr: 0.000023 closs: 0.8054 (0.8246) grad_norm: 0.4842 (0.6939) time: 5.5763 data: 0.0002 max mem: 71357 +[22:00:36.996921] Epoch: [0] [2980/6500] lr: 0.000023 closs: 0.7675 (0.8245) grad_norm: 0.4995 (0.6935) time: 5.5793 data: 0.0002 max mem: 71357 +[22:01:32.739821] Epoch: [0] [2990/6500] lr: 0.000023 closs: 0.7084 (0.8242) grad_norm: 0.5038 (0.6933) time: 5.5744 data: 0.0002 max mem: 71357 +[22:02:28.417654] Epoch: [0] [3000/6500] lr: 0.000023 closs: 0.7755 (0.8242) grad_norm: 0.5038 (0.6928) time: 5.5709 data: 0.0002 max mem: 71357 +[22:03:24.195424] Epoch: [0] [3010/6500] lr: 0.000023 closs: 0.7861 (0.8242) grad_norm: 0.5038 (0.6922) time: 5.5727 data: 0.0001 max mem: 71357 +[22:04:19.925026] Epoch: [0] [3020/6500] lr: 0.000023 closs: 0.7833 (0.8241) grad_norm: 0.4425 (0.6914) time: 5.5753 data: 0.0001 max mem: 71357 +[22:05:15.625126] Epoch: [0] [3030/6500] lr: 0.000023 closs: 0.8012 (0.8242) grad_norm: 0.4276 (0.6903) time: 5.5713 data: 0.0002 max mem: 71357 +[22:06:11.301289] Epoch: [0] [3040/6500] lr: 
0.000023 closs: 0.8578 (0.8245) grad_norm: 0.4276 (0.6900) time: 5.5687 data: 0.0002 max mem: 71357 +[22:07:07.112997] Epoch: [0] [3050/6500] lr: 0.000023 closs: 0.7844 (0.8243) grad_norm: 0.4425 (0.6896) time: 5.5743 data: 0.0001 max mem: 71357 +[22:08:02.885455] Epoch: [0] [3060/6500] lr: 0.000024 closs: 0.7363 (0.8239) grad_norm: 0.4420 (0.6890) time: 5.5791 data: 0.0001 max mem: 71357 +[22:08:58.574922] Epoch: [0] [3070/6500] lr: 0.000024 closs: 0.7478 (0.8239) grad_norm: 0.4767 (0.6882) time: 5.5730 data: 0.0001 max mem: 71357 +[22:09:54.218364] Epoch: [0] [3080/6500] lr: 0.000024 closs: 0.7478 (0.8238) grad_norm: 0.4767 (0.6877) time: 5.5666 data: 0.0001 max mem: 71357 +[22:10:49.953148] Epoch: [0] [3090/6500] lr: 0.000024 closs: 0.8290 (0.8239) grad_norm: 0.4707 (0.6872) time: 5.5688 data: 0.0002 max mem: 71357 +[22:11:45.777210] Epoch: [0] [3100/6500] lr: 0.000024 closs: 0.8290 (0.8241) grad_norm: 0.5149 (0.6872) time: 5.5778 data: 0.0002 max mem: 71357 +[22:12:41.446476] Epoch: [0] [3110/6500] lr: 0.000024 closs: 0.7752 (0.8239) grad_norm: 0.5202 (0.6871) time: 5.5746 data: 0.0002 max mem: 71357 +[22:13:37.138604] Epoch: [0] [3120/6500] lr: 0.000024 closs: 0.7729 (0.8238) grad_norm: 0.5083 (0.6864) time: 5.5680 data: 0.0001 max mem: 71357 +[22:14:32.885980] Epoch: [0] [3130/6500] lr: 0.000024 closs: 0.7892 (0.8237) grad_norm: 0.5083 (0.6860) time: 5.5719 data: 0.0001 max mem: 71357 +[22:15:28.734038] Epoch: [0] [3140/6500] lr: 0.000024 closs: 0.7201 (0.8233) grad_norm: 0.4951 (0.6855) time: 5.5797 data: 0.0001 max mem: 71357 +[22:16:24.401364] Epoch: [0] [3150/6500] lr: 0.000024 closs: 0.7225 (0.8231) grad_norm: 0.4951 (0.6855) time: 5.5757 data: 0.0001 max mem: 71357 +[22:17:20.190194] Epoch: [0] [3160/6500] lr: 0.000024 closs: 0.7225 (0.8227) grad_norm: 0.5478 (0.6851) time: 5.5727 data: 0.0002 max mem: 71357 +[22:18:15.930741] Epoch: [0] [3170/6500] lr: 0.000024 closs: 0.6877 (0.8226) grad_norm: 0.5478 (0.6849) time: 5.5764 data: 0.0002 max mem: 71357 +[22:19:11.777748] Epoch: [0] [3180/6500] lr: 0.000024 closs: 0.7892 (0.8225) grad_norm: 0.5051 (0.6841) time: 5.5793 data: 0.0002 max mem: 71357 +[22:20:07.668122] Epoch: [0] [3190/6500] lr: 0.000025 closs: 0.7907 (0.8225) grad_norm: 0.4834 (0.6836) time: 5.5868 data: 0.0002 max mem: 71357 +[22:21:03.530895] Epoch: [0] [3200/6500] lr: 0.000025 closs: 0.7862 (0.8223) grad_norm: 0.4458 (0.6829) time: 5.5876 data: 0.0002 max mem: 71357 +[22:21:59.265653] Epoch: [0] [3210/6500] lr: 0.000025 closs: 0.8013 (0.8224) grad_norm: 0.4373 (0.6825) time: 5.5798 data: 0.0001 max mem: 71357 +[22:22:54.998366] Epoch: [0] [3220/6500] lr: 0.000025 closs: 0.8076 (0.8222) grad_norm: 0.4758 (0.6823) time: 5.5733 data: 0.0002 max mem: 71357 +[22:23:50.905774] Epoch: [0] [3230/6500] lr: 0.000025 closs: 0.7747 (0.8219) grad_norm: 0.4758 (0.6816) time: 5.5819 data: 0.0002 max mem: 71357 +[22:24:46.627784] Epoch: [0] [3240/6500] lr: 0.000025 closs: 0.7195 (0.8217) grad_norm: 0.4976 (0.6818) time: 5.5813 data: 0.0002 max mem: 71357 +[22:25:42.340275] Epoch: [0] [3250/6500] lr: 0.000025 closs: 0.8083 (0.8220) grad_norm: 0.4758 (0.6814) time: 5.5716 data: 0.0002 max mem: 71357 +[22:26:38.235003] Epoch: [0] [3260/6500] lr: 0.000025 closs: 0.8261 (0.8221) grad_norm: 0.4669 (0.6806) time: 5.5802 data: 0.0002 max mem: 71357 +[22:27:34.143033] Epoch: [0] [3270/6500] lr: 0.000025 closs: 0.7940 (0.8220) grad_norm: 0.4716 (0.6802) time: 5.5900 data: 0.0002 max mem: 71357 +[22:28:29.848989] Epoch: [0] [3280/6500] lr: 0.000025 closs: 0.7949 (0.8218) grad_norm: 0.4669 
(0.6798) time: 5.5806 data: 0.0002 max mem: 71357 +[22:29:25.534312] Epoch: [0] [3290/6500] lr: 0.000025 closs: 0.7710 (0.8220) grad_norm: 0.4918 (0.6792) time: 5.5694 data: 0.0002 max mem: 71357 +[22:30:21.210353] Epoch: [0] [3300/6500] lr: 0.000025 closs: 0.7941 (0.8220) grad_norm: 0.4949 (0.6792) time: 5.5679 data: 0.0002 max mem: 71357 +[22:31:16.927815] Epoch: [0] [3310/6500] lr: 0.000025 closs: 0.7941 (0.8216) grad_norm: 0.4980 (0.6788) time: 5.5695 data: 0.0002 max mem: 71357 +[22:32:12.719534] Epoch: [0] [3320/6500] lr: 0.000026 closs: 0.6892 (0.8212) grad_norm: 0.4980 (0.6783) time: 5.5753 data: 0.0001 max mem: 71357 +[22:33:08.513920] Epoch: [0] [3330/6500] lr: 0.000026 closs: 0.7701 (0.8211) grad_norm: 0.5174 (0.6812) time: 5.5792 data: 0.0001 max mem: 71357 +[22:34:04.304134] Epoch: [0] [3340/6500] lr: 0.000026 closs: 0.7871 (0.8210) grad_norm: 0.4950 (0.6806) time: 5.5791 data: 0.0001 max mem: 71357 +[22:35:00.047708] Epoch: [0] [3350/6500] lr: 0.000026 closs: 0.7350 (0.8207) grad_norm: 0.4521 (0.6801) time: 5.5766 data: 0.0001 max mem: 71357 +[22:35:55.874424] Epoch: [0] [3360/6500] lr: 0.000026 closs: 0.7149 (0.8205) grad_norm: 0.4950 (0.6811) time: 5.5784 data: 0.0002 max mem: 71357 +[22:36:51.608865] Epoch: [0] [3370/6500] lr: 0.000026 closs: 0.6943 (0.8199) grad_norm: 0.4521 (0.6807) time: 5.5779 data: 0.0002 max mem: 71357 +[22:37:47.359747] Epoch: [0] [3380/6500] lr: 0.000026 closs: 0.7499 (0.8199) grad_norm: 0.5135 (0.6802) time: 5.5741 data: 0.0002 max mem: 71357 +[22:38:43.055778] Epoch: [0] [3390/6500] lr: 0.000026 closs: 0.7804 (0.8198) grad_norm: 0.5695 (0.6800) time: 5.5722 data: 0.0002 max mem: 71357 +[22:39:38.791189] Epoch: [0] [3400/6500] lr: 0.000026 closs: 0.7784 (0.8198) grad_norm: 0.5695 (0.6806) time: 5.5714 data: 0.0002 max mem: 71357 +[22:40:34.701703] Epoch: [0] [3410/6500] lr: 0.000026 closs: 0.8139 (0.8198) grad_norm: 0.5514 (0.6797) time: 5.5822 data: 0.0002 max mem: 71357 +[22:41:30.512183] Epoch: [0] [3420/6500] lr: 0.000026 closs: 0.7679 (0.8196) grad_norm: 0.5098 (0.6790) time: 5.5860 data: 0.0002 max mem: 71357 +[22:42:26.256064] Epoch: [0] [3430/6500] lr: 0.000026 closs: 0.7416 (0.8196) grad_norm: 0.4407 (0.6784) time: 5.5776 data: 0.0002 max mem: 71357 +[22:43:21.943934] Epoch: [0] [3440/6500] lr: 0.000026 closs: 0.6919 (0.8193) grad_norm: 0.3898 (0.6776) time: 5.5714 data: 0.0002 max mem: 71357 +[22:44:17.761189] Epoch: [0] [3450/6500] lr: 0.000027 closs: 0.7952 (0.8193) grad_norm: 0.3898 (0.6772) time: 5.5751 data: 0.0002 max mem: 71357 +[22:45:13.506509] Epoch: [0] [3460/6500] lr: 0.000027 closs: 0.8043 (0.8192) grad_norm: 0.3898 (0.6770) time: 5.5780 data: 0.0001 max mem: 71357 +[22:46:09.269380] Epoch: [0] [3470/6500] lr: 0.000027 closs: 0.7872 (0.8192) grad_norm: 0.4201 (0.6766) time: 5.5753 data: 0.0002 max mem: 71357 +[22:47:04.958374] Epoch: [0] [3480/6500] lr: 0.000027 closs: 0.7538 (0.8190) grad_norm: 0.4742 (0.6763) time: 5.5725 data: 0.0002 max mem: 71357 +[22:48:00.838506] Epoch: [0] [3490/6500] lr: 0.000027 closs: 0.7062 (0.8188) grad_norm: 0.4572 (0.6762) time: 5.5784 data: 0.0001 max mem: 71357 +[22:48:56.554925] Epoch: [0] [3500/6500] lr: 0.000027 closs: 0.7649 (0.8187) grad_norm: 0.4557 (0.6755) time: 5.5797 data: 0.0001 max mem: 71357 +[22:49:52.356055] Epoch: [0] [3510/6500] lr: 0.000027 closs: 0.7740 (0.8186) grad_norm: 0.4557 (0.6749) time: 5.5757 data: 0.0001 max mem: 71357 +[22:50:48.051775] Epoch: [0] [3520/6500] lr: 0.000027 closs: 0.6907 (0.8184) grad_norm: 0.4465 (0.6745) time: 5.5747 data: 0.0002 max mem: 71357 
+[22:51:43.736248] Epoch: [0] [3530/6500] lr: 0.000027 closs: 0.7185 (0.8184) grad_norm: 0.4883 (0.6741) time: 5.5689 data: 0.0001 max mem: 71357 +[22:52:39.582328] Epoch: [0] [3540/6500] lr: 0.000027 closs: 0.7792 (0.8182) grad_norm: 0.4887 (0.6737) time: 5.5765 data: 0.0001 max mem: 71357 +[22:53:35.217988] Epoch: [0] [3550/6500] lr: 0.000027 closs: 0.7264 (0.8181) grad_norm: 0.4923 (0.6732) time: 5.5740 data: 0.0001 max mem: 71357 +[22:54:31.016009] Epoch: [0] [3560/6500] lr: 0.000027 closs: 0.7264 (0.8179) grad_norm: 0.4923 (0.6727) time: 5.5716 data: 0.0002 max mem: 71357 +[22:55:26.802300] Epoch: [0] [3570/6500] lr: 0.000027 closs: 0.7620 (0.8178) grad_norm: 0.4646 (0.6722) time: 5.5791 data: 0.0002 max mem: 71357 +[22:56:22.687278] Epoch: [0] [3580/6500] lr: 0.000028 closs: 0.7688 (0.8178) grad_norm: 0.4572 (0.6717) time: 5.5835 data: 0.0001 max mem: 71357 +[22:57:18.492948] Epoch: [0] [3590/6500] lr: 0.000028 closs: 0.8586 (0.8179) grad_norm: 0.4335 (0.6714) time: 5.5844 data: 0.0002 max mem: 71357 +[22:58:14.266317] Epoch: [0] [3600/6500] lr: 0.000028 closs: 0.8586 (0.8181) grad_norm: 0.4301 (0.6708) time: 5.5788 data: 0.0002 max mem: 71357 +[22:59:10.030708] Epoch: [0] [3610/6500] lr: 0.000028 closs: 0.7420 (0.8178) grad_norm: 0.4168 (0.6702) time: 5.5767 data: 0.0001 max mem: 71357 +[23:00:05.740740] Epoch: [0] [3620/6500] lr: 0.000028 closs: 0.6845 (0.8177) grad_norm: 0.4147 (0.6697) time: 5.5736 data: 0.0002 max mem: 71357 +[23:01:01.628084] Epoch: [0] [3630/6500] lr: 0.000028 closs: 0.7360 (0.8176) grad_norm: 0.4531 (0.6693) time: 5.5798 data: 0.0002 max mem: 71357 +[23:01:57.312973] Epoch: [0] [3640/6500] lr: 0.000028 closs: 0.7370 (0.8177) grad_norm: 0.4726 (0.6689) time: 5.5785 data: 0.0001 max mem: 71357 +[23:02:53.018097] Epoch: [0] [3650/6500] lr: 0.000028 closs: 0.7370 (0.8174) grad_norm: 0.4726 (0.6684) time: 5.5694 data: 0.0001 max mem: 71357 +[23:03:48.696651] Epoch: [0] [3660/6500] lr: 0.000028 closs: 0.7332 (0.8172) grad_norm: 0.4864 (0.6678) time: 5.5691 data: 0.0001 max mem: 71357 +[23:04:44.522063] Epoch: [0] [3670/6500] lr: 0.000028 closs: 0.7933 (0.8171) grad_norm: 0.4716 (0.6672) time: 5.5751 data: 0.0001 max mem: 71357 +[23:05:40.229307] Epoch: [0] [3680/6500] lr: 0.000028 closs: 0.7823 (0.8170) grad_norm: 0.4653 (0.6668) time: 5.5765 data: 0.0002 max mem: 71357 +[23:06:36.012740] Epoch: [0] [3690/6500] lr: 0.000028 closs: 0.7470 (0.8169) grad_norm: 0.4864 (0.6664) time: 5.5744 data: 0.0002 max mem: 71357 +[23:07:31.710488] Epoch: [0] [3700/6500] lr: 0.000028 closs: 0.7934 (0.8167) grad_norm: 0.4874 (0.6661) time: 5.5740 data: 0.0002 max mem: 71357 +[23:08:27.509042] Epoch: [0] [3710/6500] lr: 0.000029 closs: 0.8132 (0.8167) grad_norm: 0.4970 (0.6656) time: 5.5747 data: 0.0001 max mem: 71357 +[23:09:23.247672] Epoch: [0] [3720/6500] lr: 0.000029 closs: 0.7672 (0.8166) grad_norm: 0.4874 (0.6649) time: 5.5767 data: 0.0001 max mem: 71357 +[23:10:19.028843] Epoch: [0] [3730/6500] lr: 0.000029 closs: 0.8333 (0.8167) grad_norm: 0.4414 (0.6645) time: 5.5758 data: 0.0001 max mem: 71357 +[23:11:14.783761] Epoch: [0] [3740/6500] lr: 0.000029 closs: 0.7984 (0.8165) grad_norm: 0.4400 (0.6643) time: 5.5767 data: 0.0001 max mem: 71357 +[23:12:10.561004] Epoch: [0] [3750/6500] lr: 0.000029 closs: 0.7420 (0.8166) grad_norm: 0.4073 (0.6638) time: 5.5765 data: 0.0001 max mem: 71357 +[23:13:06.422660] Epoch: [0] [3760/6500] lr: 0.000029 closs: 0.7733 (0.8165) grad_norm: 0.3984 (0.6631) time: 5.5818 data: 0.0002 max mem: 71357 +[23:14:02.251523] Epoch: [0] [3770/6500] lr: 
0.000029 closs: 0.7733 (0.8165) grad_norm: 0.4193 (0.6628) time: 5.5844 data: 0.0002 max mem: 71357 +[23:14:57.959067] Epoch: [0] [3780/6500] lr: 0.000029 closs: 0.7491 (0.8164) grad_norm: 0.4418 (0.6623) time: 5.5767 data: 0.0002 max mem: 71357 +[23:15:53.761394] Epoch: [0] [3790/6500] lr: 0.000029 closs: 0.7587 (0.8163) grad_norm: 0.4418 (0.6619) time: 5.5753 data: 0.0003 max mem: 71357 +[23:16:49.649809] Epoch: [0] [3800/6500] lr: 0.000029 closs: 0.7967 (0.8163) grad_norm: 0.4321 (0.6611) time: 5.5844 data: 0.0002 max mem: 71357 +[23:17:45.356972] Epoch: [0] [3810/6500] lr: 0.000029 closs: 0.7972 (0.8161) grad_norm: 0.4144 (0.6607) time: 5.5797 data: 0.0001 max mem: 71357 +[23:18:41.088973] Epoch: [0] [3820/6500] lr: 0.000029 closs: 0.7616 (0.8161) grad_norm: 0.4144 (0.6602) time: 5.5719 data: 0.0001 max mem: 71357 +[23:19:36.728934] Epoch: [0] [3830/6500] lr: 0.000029 closs: 0.7773 (0.8161) grad_norm: 0.4110 (0.6597) time: 5.5685 data: 0.0001 max mem: 71357 +[23:20:32.508264] Epoch: [0] [3840/6500] lr: 0.000030 closs: 0.7510 (0.8158) grad_norm: 0.4507 (0.6591) time: 5.5708 data: 0.0002 max mem: 71357 +[23:21:28.314526] Epoch: [0] [3850/6500] lr: 0.000030 closs: 0.7856 (0.8159) grad_norm: 0.4618 (0.6590) time: 5.5792 data: 0.0002 max mem: 71357 +[23:22:24.127339] Epoch: [0] [3860/6500] lr: 0.000030 closs: 0.7705 (0.8157) grad_norm: 0.4486 (0.6583) time: 5.5809 data: 0.0001 max mem: 71357 +[23:23:19.774326] Epoch: [0] [3870/6500] lr: 0.000030 closs: 0.7342 (0.8156) grad_norm: 0.4285 (0.6577) time: 5.5729 data: 0.0001 max mem: 71357 +[23:24:15.449400] Epoch: [0] [3880/6500] lr: 0.000030 closs: 0.8044 (0.8156) grad_norm: 0.4773 (0.6583) time: 5.5660 data: 0.0001 max mem: 71357 +[23:25:11.308804] Epoch: [0] [3890/6500] lr: 0.000030 closs: 0.8051 (0.8154) grad_norm: 0.4486 (0.6581) time: 5.5766 data: 0.0001 max mem: 71357 +[23:26:07.026136] Epoch: [0] [3900/6500] lr: 0.000030 closs: 0.8188 (0.8154) grad_norm: 0.4390 (0.6577) time: 5.5787 data: 0.0002 max mem: 71357 +[23:27:02.883831] Epoch: [0] [3910/6500] lr: 0.000030 closs: 0.8057 (0.8154) grad_norm: 0.4913 (0.6572) time: 5.5786 data: 0.0002 max mem: 71357 +[23:27:58.627975] Epoch: [0] [3920/6500] lr: 0.000030 closs: 0.7998 (0.8153) grad_norm: 0.4599 (0.6568) time: 5.5800 data: 0.0001 max mem: 71357 +[23:28:54.466339] Epoch: [0] [3930/6500] lr: 0.000030 closs: 0.8312 (0.8152) grad_norm: 0.4599 (0.6564) time: 5.5790 data: 0.0001 max mem: 71357 +[23:29:50.253141] Epoch: [0] [3940/6500] lr: 0.000030 closs: 0.7910 (0.8150) grad_norm: 0.4448 (0.6559) time: 5.5811 data: 0.0002 max mem: 71357 +[23:30:46.049333] Epoch: [0] [3950/6500] lr: 0.000030 closs: 0.7271 (0.8148) grad_norm: 0.4448 (0.6554) time: 5.5790 data: 0.0002 max mem: 71357 +[23:31:41.791787] Epoch: [0] [3960/6500] lr: 0.000030 closs: 0.7542 (0.8148) grad_norm: 0.4852 (0.6553) time: 5.5768 data: 0.0002 max mem: 71357 +[23:32:37.496971] Epoch: [0] [3970/6500] lr: 0.000031 closs: 0.7941 (0.8147) grad_norm: 0.4852 (0.6549) time: 5.5723 data: 0.0002 max mem: 71357 +[23:33:33.374044] Epoch: [0] [3980/6500] lr: 0.000031 closs: 0.7972 (0.8147) grad_norm: 0.4313 (0.6542) time: 5.5790 data: 0.0002 max mem: 71357 +[23:34:29.051290] Epoch: [0] [3990/6500] lr: 0.000031 closs: 0.7962 (0.8146) grad_norm: 0.4339 (0.6539) time: 5.5776 data: 0.0002 max mem: 71357 +[23:35:24.744469] Epoch: [0] [4000/6500] lr: 0.000031 closs: 0.7493 (0.8144) grad_norm: 0.4339 (0.6536) time: 5.5684 data: 0.0002 max mem: 71357 +[23:36:20.520889] Epoch: [0] [4010/6500] lr: 0.000031 closs: 0.7516 (0.8142) grad_norm: 0.4339 
(0.6533) time: 5.5733 data: 0.0002 max mem: 71357 +[23:37:16.369144] Epoch: [0] [4020/6500] lr: 0.000031 closs: 0.8050 (0.8143) grad_norm: 0.4571 (0.6530) time: 5.5811 data: 0.0002 max mem: 71357 +[23:38:12.149618] Epoch: [0] [4030/6500] lr: 0.000031 closs: 0.8104 (0.8143) grad_norm: 0.4843 (0.6527) time: 5.5813 data: 0.0002 max mem: 71357 +[23:39:07.897266] Epoch: [0] [4040/6500] lr: 0.000031 closs: 0.7765 (0.8142) grad_norm: 0.4843 (0.6523) time: 5.5763 data: 0.0002 max mem: 71357 +[23:40:03.635775] Epoch: [0] [4050/6500] lr: 0.000031 closs: 0.7657 (0.8142) grad_norm: 0.4843 (0.6518) time: 5.5742 data: 0.0002 max mem: 71357 +[23:40:59.373975] Epoch: [0] [4060/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4719 (0.6513) time: 5.5737 data: 0.0002 max mem: 71357 +[23:41:55.240081] Epoch: [0] [4070/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4648 (0.6509) time: 5.5801 data: 0.0002 max mem: 71357 +[23:42:51.032317] Epoch: [0] [4080/6500] lr: 0.000031 closs: 0.8569 (0.8142) grad_norm: 0.4648 (0.6508) time: 5.5828 data: 0.0002 max mem: 71357 +[23:43:46.755137] Epoch: [0] [4090/6500] lr: 0.000031 closs: 0.8239 (0.8142) grad_norm: 0.4661 (0.6506) time: 5.5757 data: 0.0002 max mem: 71357 +[23:44:42.512116] Epoch: [0] [4100/6500] lr: 0.000032 closs: 0.7687 (0.8141) grad_norm: 0.4823 (0.6501) time: 5.5739 data: 0.0002 max mem: 71357 +[23:45:38.454009] Epoch: [0] [4110/6500] lr: 0.000032 closs: 0.7388 (0.8139) grad_norm: 0.4725 (0.6495) time: 5.5848 data: 0.0002 max mem: 71357 +[23:46:34.148019] Epoch: [0] [4120/6500] lr: 0.000032 closs: 0.7714 (0.8141) grad_norm: 0.4684 (0.6490) time: 5.5817 data: 0.0002 max mem: 71357 +[23:47:29.964993] Epoch: [0] [4130/6500] lr: 0.000032 closs: 0.7955 (0.8139) grad_norm: 0.4303 (0.6485) time: 5.5754 data: 0.0001 max mem: 71357 +[23:48:25.655582] Epoch: [0] [4140/6500] lr: 0.000032 closs: 0.6973 (0.8136) grad_norm: 0.4611 (0.6488) time: 5.5753 data: 0.0001 max mem: 71357 +[23:49:21.369932] Epoch: [0] [4150/6500] lr: 0.000032 closs: 0.7174 (0.8135) grad_norm: 0.4709 (0.6485) time: 5.5702 data: 0.0001 max mem: 71357 +[23:50:17.071274] Epoch: [0] [4160/6500] lr: 0.000032 closs: 0.7410 (0.8134) grad_norm: 0.4709 (0.6480) time: 5.5707 data: 0.0001 max mem: 71357 +[23:51:12.793801] Epoch: [0] [4170/6500] lr: 0.000032 closs: 0.6819 (0.8131) grad_norm: 0.4439 (0.6479) time: 5.5711 data: 0.0001 max mem: 71357 +[23:52:08.637169] Epoch: [0] [4180/6500] lr: 0.000032 closs: 0.6791 (0.8130) grad_norm: 0.4330 (0.6475) time: 5.5782 data: 0.0001 max mem: 71357 +[23:53:04.424045] Epoch: [0] [4190/6500] lr: 0.000032 closs: 0.6955 (0.8127) grad_norm: 0.4330 (0.6469) time: 5.5814 data: 0.0002 max mem: 71357 +[23:54:00.361724] Epoch: [0] [4200/6500] lr: 0.000032 closs: 0.7177 (0.8125) grad_norm: 0.3919 (0.6463) time: 5.5861 data: 0.0002 max mem: 71357 +[23:54:56.077357] Epoch: [0] [4210/6500] lr: 0.000032 closs: 0.7655 (0.8125) grad_norm: 0.3950 (0.6458) time: 5.5826 data: 0.0002 max mem: 71357 +[23:55:51.890551] Epoch: [0] [4220/6500] lr: 0.000032 closs: 0.7983 (0.8125) grad_norm: 0.3997 (0.6454) time: 5.5764 data: 0.0002 max mem: 71357 +[23:56:47.608587] Epoch: [0] [4230/6500] lr: 0.000033 closs: 0.8221 (0.8125) grad_norm: 0.4214 (0.6449) time: 5.5765 data: 0.0002 max mem: 71357 +[23:57:43.392132] Epoch: [0] [4240/6500] lr: 0.000033 closs: 0.7227 (0.8123) grad_norm: 0.4214 (0.6444) time: 5.5750 data: 0.0001 max mem: 71357 +[23:58:39.182546] Epoch: [0] [4250/6500] lr: 0.000033 closs: 0.7145 (0.8122) grad_norm: 0.4214 (0.6440) time: 5.5786 data: 0.0001 max mem: 71357 
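The lr column climbs steadily (0.000033 around step 4240 here, up from 0.000020 near step 2560) and plateaus at 0.000050 just as epoch 0 ends, which is consistent with a linear warm-up to a 5e-5 peak over the first ~6500 steps. That schedule is inferred from the logged values, not stated anywhere in the log; a sketch under that assumption:

```python
# Sketch of a linear warm-up consistent with the lr values in this log
# (inferred, not confirmed by the repo): ramp to 5e-5 over ~6500 steps, then hold.
def warmup_lr(step: int, warmup_steps: int = 6500, peak_lr: float = 5e-5) -> float:
    """Linearly ramp the learning rate, then hold it at the peak."""
    return peak_lr * min(step, warmup_steps) / warmup_steps

for s in (2560, 4240, 6440):
    print(s, f"{warmup_lr(s):.6f}")  # ~0.000020, 0.000033, 0.000050 — matches the log
```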
+[23:59:34.962503] Epoch: [0] [4260/6500] lr: 0.000033 closs: 0.7968 (0.8123) grad_norm: 0.4069 (0.6434) time: 5.5784 data: 0.0001 max mem: 71357 +[00:00:30.653249] Epoch: [0] [4270/6500] lr: 0.000033 closs: 0.8109 (0.8125) grad_norm: 0.3782 (0.6429) time: 5.5734 data: 0.0001 max mem: 71357 +[00:01:26.507609] Epoch: [0] [4280/6500] lr: 0.000033 closs: 0.8543 (0.8124) grad_norm: 0.3807 (0.6423) time: 5.5772 data: 0.0001 max mem: 71357 +[00:02:22.266367] Epoch: [0] [4290/6500] lr: 0.000033 closs: 0.7564 (0.8124) grad_norm: 0.3910 (0.6420) time: 5.5806 data: 0.0001 max mem: 71357 +[00:03:18.041885] Epoch: [0] [4300/6500] lr: 0.000033 closs: 0.7627 (0.8123) grad_norm: 0.4018 (0.6417) time: 5.5766 data: 0.0001 max mem: 71357 +[00:04:13.802843] Epoch: [0] [4310/6500] lr: 0.000033 closs: 0.7162 (0.8120) grad_norm: 0.4247 (0.6412) time: 5.5767 data: 0.0001 max mem: 71357 +[00:05:09.467895] Epoch: [0] [4320/6500] lr: 0.000033 closs: 0.6992 (0.8118) grad_norm: 0.4258 (0.6409) time: 5.5712 data: 0.0001 max mem: 71357 +[00:06:05.337634] Epoch: [0] [4330/6500] lr: 0.000033 closs: 0.7296 (0.8117) grad_norm: 0.4258 (0.6405) time: 5.5766 data: 0.0002 max mem: 71357 +[00:07:01.137398] Epoch: [0] [4340/6500] lr: 0.000033 closs: 0.7877 (0.8118) grad_norm: 0.4247 (0.6401) time: 5.5834 data: 0.0002 max mem: 71357 +[00:07:56.821784] Epoch: [0] [4350/6500] lr: 0.000033 closs: 0.8040 (0.8117) grad_norm: 0.4431 (0.6400) time: 5.5741 data: 0.0002 max mem: 71357 +[00:08:52.489372] Epoch: [0] [4360/6500] lr: 0.000034 closs: 0.7670 (0.8116) grad_norm: 0.4431 (0.6398) time: 5.5675 data: 0.0001 max mem: 71357 +[00:09:48.335184] Epoch: [0] [4370/6500] lr: 0.000034 closs: 0.7801 (0.8116) grad_norm: 0.4316 (0.6393) time: 5.5755 data: 0.0002 max mem: 71357 +[00:10:44.089817] Epoch: [0] [4380/6500] lr: 0.000034 closs: 0.7409 (0.8114) grad_norm: 0.4314 (0.6389) time: 5.5799 data: 0.0002 max mem: 71357 +[00:11:39.833736] Epoch: [0] [4390/6500] lr: 0.000034 closs: 0.7122 (0.8114) grad_norm: 0.4360 (0.6385) time: 5.5748 data: 0.0002 max mem: 71357 +[00:12:35.503403] Epoch: [0] [4400/6500] lr: 0.000034 closs: 0.7840 (0.8113) grad_norm: 0.4360 (0.6384) time: 5.5705 data: 0.0001 max mem: 71357 +[00:13:31.186127] Epoch: [0] [4410/6500] lr: 0.000034 closs: 0.7011 (0.8111) grad_norm: 0.4577 (0.6383) time: 5.5675 data: 0.0001 max mem: 71357 +[00:14:26.967773] Epoch: [0] [4420/6500] lr: 0.000034 closs: 0.7771 (0.8112) grad_norm: 0.4687 (0.6380) time: 5.5731 data: 0.0001 max mem: 71357 +[00:15:22.663366] Epoch: [0] [4430/6500] lr: 0.000034 closs: 0.7935 (0.8111) grad_norm: 0.4562 (0.6382) time: 5.5738 data: 0.0001 max mem: 71357 +[00:16:18.415629] Epoch: [0] [4440/6500] lr: 0.000034 closs: 0.7374 (0.8107) grad_norm: 0.4562 (0.6377) time: 5.5723 data: 0.0002 max mem: 71357 +[00:17:14.098438] Epoch: [0] [4450/6500] lr: 0.000034 closs: 0.7220 (0.8107) grad_norm: 0.4404 (0.6374) time: 5.5716 data: 0.0002 max mem: 71357 +[00:18:09.928472] Epoch: [0] [4460/6500] lr: 0.000034 closs: 0.7684 (0.8105) grad_norm: 0.4403 (0.6373) time: 5.5755 data: 0.0001 max mem: 71357 +[00:19:05.687721] Epoch: [0] [4470/6500] lr: 0.000034 closs: 0.7288 (0.8105) grad_norm: 0.4834 (0.6370) time: 5.5793 data: 0.0001 max mem: 71357 +[00:20:01.374623] Epoch: [0] [4480/6500] lr: 0.000034 closs: 0.8508 (0.8105) grad_norm: 0.4678 (0.6365) time: 5.5722 data: 0.0001 max mem: 71357 +[00:20:57.038781] Epoch: [0] [4490/6500] lr: 0.000035 closs: 0.8271 (0.8105) grad_norm: 0.4790 (0.6363) time: 5.5674 data: 0.0001 max mem: 71357 +[00:21:52.724399] Epoch: [0] [4500/6500] lr: 
0.000035 closs: 0.8136 (0.8104) grad_norm: 0.4834 (0.6360) time: 5.5674 data: 0.0001 max mem: 71357 +[00:22:48.517686] Epoch: [0] [4510/6500] lr: 0.000035 closs: 0.7913 (0.8104) grad_norm: 0.4602 (0.6357) time: 5.5738 data: 0.0001 max mem: 71357 +[00:23:44.328287] Epoch: [0] [4520/6500] lr: 0.000035 closs: 0.7664 (0.8102) grad_norm: 0.4774 (0.6353) time: 5.5801 data: 0.0001 max mem: 71357 +[00:24:40.071329] Epoch: [0] [4530/6500] lr: 0.000035 closs: 0.7084 (0.8101) grad_norm: 0.4480 (0.6351) time: 5.5776 data: 0.0001 max mem: 71357 +[00:25:35.770539] Epoch: [0] [4540/6500] lr: 0.000035 closs: 0.6961 (0.8099) grad_norm: 0.4464 (0.6349) time: 5.5720 data: 0.0001 max mem: 71357 +[00:26:31.605185] Epoch: [0] [4550/6500] lr: 0.000035 closs: 0.7117 (0.8096) grad_norm: 0.4464 (0.6353) time: 5.5766 data: 0.0001 max mem: 71357 +[00:27:27.408895] Epoch: [0] [4560/6500] lr: 0.000035 closs: 0.7221 (0.8096) grad_norm: 0.4406 (0.6349) time: 5.5818 data: 0.0001 max mem: 71357 +[00:28:23.168811] Epoch: [0] [4570/6500] lr: 0.000035 closs: 0.7814 (0.8096) grad_norm: 0.4406 (0.6345) time: 5.5781 data: 0.0001 max mem: 71357 +[00:29:18.810132] Epoch: [0] [4580/6500] lr: 0.000035 closs: 0.7730 (0.8094) grad_norm: 0.4818 (0.6345) time: 5.5700 data: 0.0001 max mem: 71357 +[00:30:14.742733] Epoch: [0] [4590/6500] lr: 0.000035 closs: 0.7002 (0.8092) grad_norm: 0.4623 (0.6342) time: 5.5786 data: 0.0001 max mem: 71357 +[00:31:10.481301] Epoch: [0] [4600/6500] lr: 0.000035 closs: 0.7432 (0.8091) grad_norm: 0.4623 (0.6339) time: 5.5835 data: 0.0001 max mem: 71357 +[00:32:06.158824] Epoch: [0] [4610/6500] lr: 0.000035 closs: 0.7692 (0.8091) grad_norm: 0.5206 (0.6339) time: 5.5707 data: 0.0001 max mem: 71357 +[00:33:01.810286] Epoch: [0] [4620/6500] lr: 0.000036 closs: 0.7397 (0.8088) grad_norm: 0.5155 (0.6335) time: 5.5663 data: 0.0001 max mem: 71357 +[00:33:57.482433] Epoch: [0] [4630/6500] lr: 0.000036 closs: 0.6831 (0.8086) grad_norm: 0.5155 (0.6331) time: 5.5661 data: 0.0001 max mem: 71357 +[00:34:53.309586] Epoch: [0] [4640/6500] lr: 0.000036 closs: 0.6817 (0.8084) grad_norm: 0.5155 (0.6328) time: 5.5748 data: 0.0001 max mem: 71357 +[00:35:49.041555] Epoch: [0] [4650/6500] lr: 0.000036 closs: 0.7070 (0.8081) grad_norm: 0.4219 (0.6324) time: 5.5779 data: 0.0001 max mem: 71357 +[00:36:44.777926] Epoch: [0] [4660/6500] lr: 0.000036 closs: 0.7158 (0.8080) grad_norm: 0.4482 (0.6320) time: 5.5733 data: 0.0001 max mem: 71357 +[00:37:40.419372] Epoch: [0] [4670/6500] lr: 0.000036 closs: 0.7453 (0.8080) grad_norm: 0.4652 (0.6324) time: 5.5688 data: 0.0001 max mem: 71357 +[00:38:36.219244] Epoch: [0] [4680/6500] lr: 0.000036 closs: 0.7802 (0.8080) grad_norm: 0.4763 (0.6324) time: 5.5720 data: 0.0001 max mem: 71357 +[00:39:31.970352] Epoch: [0] [4690/6500] lr: 0.000036 closs: 0.7953 (0.8080) grad_norm: 0.5270 (0.6321) time: 5.5774 data: 0.0001 max mem: 71357 +[00:40:27.645439] Epoch: [0] [4700/6500] lr: 0.000036 closs: 0.7213 (0.8078) grad_norm: 0.5270 (0.6317) time: 5.5712 data: 0.0001 max mem: 71357 +[00:41:23.345485] Epoch: [0] [4710/6500] lr: 0.000036 closs: 0.7213 (0.8077) grad_norm: 0.4877 (0.6315) time: 5.5686 data: 0.0002 max mem: 71357 +[00:42:19.100016] Epoch: [0] [4720/6500] lr: 0.000036 closs: 0.7877 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5726 data: 0.0002 max mem: 71357 +[00:43:14.868291] Epoch: [0] [4730/6500] lr: 0.000036 closs: 0.8066 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5761 data: 0.0001 max mem: 71357 +[00:44:10.617607] Epoch: [0] [4740/6500] lr: 0.000036 closs: 0.8107 (0.8077) grad_norm: 0.4426 
(0.6306) time: 5.5758 data: 0.0001 max mem: 71357 +[00:45:06.457161] Epoch: [0] [4750/6500] lr: 0.000037 closs: 0.8107 (0.8076) grad_norm: 0.4339 (0.6306) time: 5.5793 data: 0.0001 max mem: 71357 +[00:46:02.224391] Epoch: [0] [4760/6500] lr: 0.000037 closs: 0.7232 (0.8074) grad_norm: 0.4325 (0.6302) time: 5.5802 data: 0.0002 max mem: 71357 +[00:46:58.073901] Epoch: [0] [4770/6500] lr: 0.000037 closs: 0.6645 (0.8072) grad_norm: 0.3619 (0.6297) time: 5.5807 data: 0.0002 max mem: 71357 +[00:47:53.806339] Epoch: [0] [4780/6500] lr: 0.000037 closs: 0.7087 (0.8071) grad_norm: 0.3475 (0.6292) time: 5.5790 data: 0.0001 max mem: 71357 +[00:48:49.478935] Epoch: [0] [4790/6500] lr: 0.000037 closs: 0.8572 (0.8073) grad_norm: 0.4056 (0.6289) time: 5.5701 data: 0.0001 max mem: 71357 +[00:49:45.256418] Epoch: [0] [4800/6500] lr: 0.000037 closs: 0.7900 (0.8071) grad_norm: 0.4126 (0.6285) time: 5.5724 data: 0.0001 max mem: 71357 +[00:50:41.081781] Epoch: [0] [4810/6500] lr: 0.000037 closs: 0.7765 (0.8071) grad_norm: 0.4338 (0.6283) time: 5.5800 data: 0.0001 max mem: 71357 +[00:51:36.744745] Epoch: [0] [4820/6500] lr: 0.000037 closs: 0.8162 (0.8073) grad_norm: 0.4591 (0.6280) time: 5.5743 data: 0.0001 max mem: 71357 +[00:52:32.438026] Epoch: [0] [4830/6500] lr: 0.000037 closs: 0.8205 (0.8073) grad_norm: 0.4549 (0.6276) time: 5.5677 data: 0.0001 max mem: 71357 +[00:53:28.073336] Epoch: [0] [4840/6500] lr: 0.000037 closs: 0.8205 (0.8074) grad_norm: 0.4776 (0.6276) time: 5.5664 data: 0.0001 max mem: 71357 +[00:54:23.863171] Epoch: [0] [4850/6500] lr: 0.000037 closs: 0.8377 (0.8073) grad_norm: 0.4549 (0.6272) time: 5.5712 data: 0.0001 max mem: 71357 +[00:55:19.616326] Epoch: [0] [4860/6500] lr: 0.000037 closs: 0.7501 (0.8072) grad_norm: 0.4700 (0.6269) time: 5.5771 data: 0.0001 max mem: 71357 +[00:56:15.311381] Epoch: [0] [4870/6500] lr: 0.000037 closs: 0.7358 (0.8071) grad_norm: 0.4839 (0.6266) time: 5.5723 data: 0.0002 max mem: 71357 +[00:57:10.987223] Epoch: [0] [4880/6500] lr: 0.000038 closs: 0.7905 (0.8072) grad_norm: 0.4700 (0.6265) time: 5.5684 data: 0.0002 max mem: 71357 +[00:58:06.697477] Epoch: [0] [4890/6500] lr: 0.000038 closs: 0.8178 (0.8072) grad_norm: 0.4622 (0.6262) time: 5.5692 data: 0.0001 max mem: 71357 +[00:59:02.438896] Epoch: [0] [4900/6500] lr: 0.000038 closs: 0.8119 (0.8072) grad_norm: 0.4416 (0.6260) time: 5.5725 data: 0.0001 max mem: 71357 +[00:59:58.097865] Epoch: [0] [4910/6500] lr: 0.000038 closs: 0.7644 (0.8072) grad_norm: 0.4335 (0.6256) time: 5.5699 data: 0.0001 max mem: 71357 +[01:00:53.757132] Epoch: [0] [4920/6500] lr: 0.000038 closs: 0.7307 (0.8071) grad_norm: 0.4416 (0.6254) time: 5.5658 data: 0.0002 max mem: 71357 +[01:01:49.419719] Epoch: [0] [4930/6500] lr: 0.000038 closs: 0.7684 (0.8071) grad_norm: 0.4416 (0.6251) time: 5.5660 data: 0.0002 max mem: 71357 +[01:02:45.018759] Epoch: [0] [4940/6500] lr: 0.000038 closs: 0.8007 (0.8071) grad_norm: 0.4659 (0.6250) time: 5.5630 data: 0.0001 max mem: 71357 +[01:03:40.831339] Epoch: [0] [4950/6500] lr: 0.000038 closs: 0.7903 (0.8070) grad_norm: 0.4948 (0.6247) time: 5.5705 data: 0.0001 max mem: 71357 +[01:04:36.520963] Epoch: [0] [4960/6500] lr: 0.000038 closs: 0.7722 (0.8068) grad_norm: 0.4659 (0.6243) time: 5.5750 data: 0.0001 max mem: 71357 +[01:05:32.153893] Epoch: [0] [4970/6500] lr: 0.000038 closs: 0.7275 (0.8067) grad_norm: 0.4508 (0.6239) time: 5.5660 data: 0.0001 max mem: 71357 +[01:06:27.840249] Epoch: [0] [4980/6500] lr: 0.000038 closs: 0.7662 (0.8067) grad_norm: 0.4232 (0.6235) time: 5.5658 data: 0.0002 max mem: 71357 
+[01:07:23.696178] Epoch: [0] [4990/6500] lr: 0.000038 closs: 0.7900 (0.8066) grad_norm: 0.4029 (0.6231) time: 5.5770 data: 0.0002 max mem: 71357 +[01:08:19.291258] Epoch: [0] [5000/6500] lr: 0.000038 closs: 0.6995 (0.8064) grad_norm: 0.4029 (0.6229) time: 5.5725 data: 0.0001 max mem: 71357 +[01:09:14.973960] Epoch: [0] [5010/6500] lr: 0.000039 closs: 0.6936 (0.8064) grad_norm: 0.4029 (0.6225) time: 5.5638 data: 0.0001 max mem: 71357 +[01:10:10.617873] Epoch: [0] [5020/6500] lr: 0.000039 closs: 0.7353 (0.8063) grad_norm: 0.4315 (0.6223) time: 5.5663 data: 0.0001 max mem: 71357 +[01:11:06.449934] Epoch: [0] [5030/6500] lr: 0.000039 closs: 0.8164 (0.8063) grad_norm: 0.4530 (0.6219) time: 5.5737 data: 0.0001 max mem: 71357 +[01:12:02.146675] Epoch: [0] [5040/6500] lr: 0.000039 closs: 0.7967 (0.8062) grad_norm: 0.4530 (0.6215) time: 5.5763 data: 0.0001 max mem: 71357 +[01:12:57.852880] Epoch: [0] [5050/6500] lr: 0.000039 closs: 0.7883 (0.8062) grad_norm: 0.4232 (0.6211) time: 5.5700 data: 0.0001 max mem: 71357 +[01:13:53.575353] Epoch: [0] [5060/6500] lr: 0.000039 closs: 0.7221 (0.8059) grad_norm: 0.4012 (0.6207) time: 5.5713 data: 0.0001 max mem: 71357 +[01:14:49.232927] Epoch: [0] [5070/6500] lr: 0.000039 closs: 0.6839 (0.8059) grad_norm: 0.3975 (0.6202) time: 5.5689 data: 0.0001 max mem: 71357 +[01:15:45.028965] Epoch: [0] [5080/6500] lr: 0.000039 closs: 0.7101 (0.8057) grad_norm: 0.4012 (0.6198) time: 5.5726 data: 0.0001 max mem: 71357 +[01:16:40.801994] Epoch: [0] [5090/6500] lr: 0.000039 closs: 0.7101 (0.8056) grad_norm: 0.3859 (0.6193) time: 5.5783 data: 0.0001 max mem: 71357 +[01:17:36.606893] Epoch: [0] [5100/6500] lr: 0.000039 closs: 0.7536 (0.8056) grad_norm: 0.3690 (0.6189) time: 5.5788 data: 0.0001 max mem: 71357 +[01:18:32.252167] Epoch: [0] [5110/6500] lr: 0.000039 closs: 0.7552 (0.8055) grad_norm: 0.3921 (0.6186) time: 5.5724 data: 0.0001 max mem: 71357 +[01:19:28.118493] Epoch: [0] [5120/6500] lr: 0.000039 closs: 0.7621 (0.8055) grad_norm: 0.3921 (0.6181) time: 5.5755 data: 0.0001 max mem: 71357 +[01:20:23.806918] Epoch: [0] [5130/6500] lr: 0.000039 closs: 0.7900 (0.8054) grad_norm: 0.3971 (0.6178) time: 5.5776 data: 0.0001 max mem: 71357 +[01:21:19.560680] Epoch: [0] [5140/6500] lr: 0.000040 closs: 0.7998 (0.8055) grad_norm: 0.4023 (0.6177) time: 5.5720 data: 0.0002 max mem: 71357 +[01:22:15.265374] Epoch: [0] [5150/6500] lr: 0.000040 closs: 0.8014 (0.8056) grad_norm: 0.4023 (0.6174) time: 5.5728 data: 0.0002 max mem: 71357 +[01:23:10.920939] Epoch: [0] [5160/6500] lr: 0.000040 closs: 0.8151 (0.8056) grad_norm: 0.4297 (0.6172) time: 5.5679 data: 0.0001 max mem: 71357 +[01:24:06.835511] Epoch: [0] [5170/6500] lr: 0.000040 closs: 0.7360 (0.8054) grad_norm: 0.4297 (0.6169) time: 5.5784 data: 0.0001 max mem: 71357 +[01:25:02.500269] Epoch: [0] [5180/6500] lr: 0.000040 closs: 0.7251 (0.8053) grad_norm: 0.4110 (0.6165) time: 5.5789 data: 0.0001 max mem: 71357 +[01:25:58.200527] Epoch: [0] [5190/6500] lr: 0.000040 closs: 0.7061 (0.8051) grad_norm: 0.4110 (0.6162) time: 5.5681 data: 0.0001 max mem: 71357 +[01:26:53.995989] Epoch: [0] [5200/6500] lr: 0.000040 closs: 0.7137 (0.8051) grad_norm: 0.3776 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 +[01:27:49.727844] Epoch: [0] [5210/6500] lr: 0.000040 closs: 0.7441 (0.8049) grad_norm: 0.3776 (0.6156) time: 5.5762 data: 0.0001 max mem: 71357 +[01:28:45.507070] Epoch: [0] [5220/6500] lr: 0.000040 closs: 0.7577 (0.8049) grad_norm: 0.3804 (0.6153) time: 5.5754 data: 0.0001 max mem: 71357 +[01:29:41.159681] Epoch: [0] [5230/6500] lr: 
0.000040 closs: 0.7887 (0.8048) grad_norm: 0.3804 (0.6169) time: 5.5715 data: 0.0001 max mem: 71357 +[01:30:36.804523] Epoch: [0] [5240/6500] lr: 0.000040 closs: 0.7531 (0.8048) grad_norm: 0.4002 (0.6164) time: 5.5648 data: 0.0001 max mem: 71357 +[01:31:32.606137] Epoch: [0] [5250/6500] lr: 0.000040 closs: 0.7744 (0.8048) grad_norm: 0.4071 (0.6161) time: 5.5722 data: 0.0001 max mem: 71357 +[01:32:28.301729] Epoch: [0] [5260/6500] lr: 0.000040 closs: 0.8277 (0.8048) grad_norm: 0.4002 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 +[01:33:24.112652] Epoch: [0] [5270/6500] lr: 0.000041 closs: 0.7664 (0.8048) grad_norm: 0.3901 (0.6154) time: 5.5752 data: 0.0001 max mem: 71357 +[01:34:19.868194] Epoch: [0] [5280/6500] lr: 0.000041 closs: 0.7340 (0.8046) grad_norm: 0.3901 (0.6151) time: 5.5782 data: 0.0001 max mem: 71357 +[01:35:15.606432] Epoch: [0] [5290/6500] lr: 0.000041 closs: 0.6905 (0.8045) grad_norm: 0.3896 (0.6146) time: 5.5746 data: 0.0001 max mem: 71357 +[01:36:11.383142] Epoch: [0] [5300/6500] lr: 0.000041 closs: 0.7767 (0.8045) grad_norm: 0.3896 (0.6143) time: 5.5757 data: 0.0001 max mem: 71357 +[01:37:07.009457] Epoch: [0] [5310/6500] lr: 0.000041 closs: 0.7287 (0.8044) grad_norm: 0.3921 (0.6145) time: 5.5701 data: 0.0001 max mem: 71357 +[01:38:02.669739] Epoch: [0] [5320/6500] lr: 0.000041 closs: 0.7413 (0.8044) grad_norm: 0.4500 (0.6148) time: 5.5642 data: 0.0001 max mem: 71357 +[01:38:58.422769] Epoch: [0] [5330/6500] lr: 0.000041 closs: 0.7950 (0.8044) grad_norm: 0.4939 (0.6144) time: 5.5706 data: 0.0001 max mem: 71357 +[01:39:54.248251] Epoch: [0] [5340/6500] lr: 0.000041 closs: 0.7414 (0.8043) grad_norm: 0.5255 (0.6143) time: 5.5788 data: 0.0001 max mem: 71357 +[01:40:50.000851] Epoch: [0] [5350/6500] lr: 0.000041 closs: 0.7228 (0.8042) grad_norm: 0.4853 (0.6139) time: 5.5788 data: 0.0001 max mem: 71357 +[01:41:45.707984] Epoch: [0] [5360/6500] lr: 0.000041 closs: 0.7594 (0.8041) grad_norm: 0.4699 (0.6136) time: 5.5729 data: 0.0001 max mem: 71357 +[01:42:41.349859] Epoch: [0] [5370/6500] lr: 0.000041 closs: 0.7594 (0.8040) grad_norm: 0.4699 (0.6133) time: 5.5673 data: 0.0001 max mem: 71357 +[01:43:37.059969] Epoch: [0] [5380/6500] lr: 0.000041 closs: 0.7105 (0.8039) grad_norm: 0.4386 (0.6130) time: 5.5675 data: 0.0001 max mem: 71357 +[01:44:32.852298] Epoch: [0] [5390/6500] lr: 0.000041 closs: 0.7781 (0.8039) grad_norm: 0.4505 (0.6128) time: 5.5750 data: 0.0001 max mem: 71357 +[01:45:28.525634] Epoch: [0] [5400/6500] lr: 0.000042 closs: 0.7665 (0.8038) grad_norm: 0.4386 (0.6126) time: 5.5732 data: 0.0001 max mem: 71357 +[01:46:24.224107] Epoch: [0] [5410/6500] lr: 0.000042 closs: 0.7346 (0.8037) grad_norm: 0.4365 (0.6124) time: 5.5685 data: 0.0002 max mem: 71357 +[01:47:19.853525] Epoch: [0] [5420/6500] lr: 0.000042 closs: 0.7093 (0.8034) grad_norm: 0.5518 (0.6123) time: 5.5663 data: 0.0002 max mem: 71357 +[01:48:15.651981] Epoch: [0] [5430/6500] lr: 0.000042 closs: 0.6989 (0.8033) grad_norm: 0.4783 (0.6121) time: 5.5713 data: 0.0001 max mem: 71357 +[01:49:11.451075] Epoch: [0] [5440/6500] lr: 0.000042 closs: 0.7255 (0.8033) grad_norm: 0.4431 (0.6119) time: 5.5797 data: 0.0001 max mem: 71357 +[01:50:07.107339] Epoch: [0] [5450/6500] lr: 0.000042 closs: 0.7646 (0.8033) grad_norm: 0.4431 (0.6116) time: 5.5726 data: 0.0001 max mem: 71357 +[01:51:02.882550] Epoch: [0] [5460/6500] lr: 0.000042 closs: 0.7432 (0.8032) grad_norm: 0.4143 (0.6113) time: 5.5715 data: 0.0002 max mem: 71357 +[01:51:58.596127] Epoch: [0] [5470/6500] lr: 0.000042 closs: 0.7326 (0.8032) grad_norm: 0.4256 
(0.6112) time: 5.5744 data: 0.0002 max mem: 71357 +[01:52:54.291718] Epoch: [0] [5480/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4343 (0.6207) time: 5.5704 data: 0.0001 max mem: 71357 +[01:53:50.039357] Epoch: [0] [5490/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4256 (0.6203) time: 5.5720 data: 0.0001 max mem: 71357 +[01:54:45.630943] Epoch: [0] [5500/6500] lr: 0.000042 closs: 0.7862 (0.8032) grad_norm: 0.4718 (0.6201) time: 5.5668 data: 0.0001 max mem: 71357 +[01:55:41.348185] Epoch: [0] [5510/6500] lr: 0.000042 closs: 0.7569 (0.8031) grad_norm: 0.4718 (0.6200) time: 5.5653 data: 0.0001 max mem: 71357 +[01:56:37.065250] Epoch: [0] [5520/6500] lr: 0.000042 closs: 0.7050 (0.8030) grad_norm: 0.4453 (0.6197) time: 5.5716 data: 0.0002 max mem: 71357 +[01:57:32.797146] Epoch: [0] [5530/6500] lr: 0.000043 closs: 0.7246 (0.8030) grad_norm: 0.4408 (0.6194) time: 5.5723 data: 0.0002 max mem: 71357 +[01:58:28.499852] Epoch: [0] [5540/6500] lr: 0.000043 closs: 0.7876 (0.8030) grad_norm: 0.4408 (0.6191) time: 5.5716 data: 0.0001 max mem: 71357 +[01:59:24.128893] Epoch: [0] [5550/6500] lr: 0.000043 closs: 0.7692 (0.8030) grad_norm: 0.4469 (0.6190) time: 5.5665 data: 0.0001 max mem: 71357 +[02:00:19.963717] Epoch: [0] [5560/6500] lr: 0.000043 closs: 0.6882 (0.8026) grad_norm: 0.4469 (0.6187) time: 5.5731 data: 0.0001 max mem: 71357 +[02:01:15.615241] Epoch: [0] [5570/6500] lr: 0.000043 closs: 0.7543 (0.8027) grad_norm: 0.4919 (0.6188) time: 5.5742 data: 0.0002 max mem: 71357 +[02:02:11.280741] Epoch: [0] [5580/6500] lr: 0.000043 closs: 0.7857 (0.8028) grad_norm: 0.5595 (0.6187) time: 5.5657 data: 0.0002 max mem: 71357 +[02:03:06.899155] Epoch: [0] [5590/6500] lr: 0.000043 closs: 0.7333 (0.8028) grad_norm: 0.5595 (0.6185) time: 5.5641 data: 0.0001 max mem: 71357 +[02:04:02.557866] Epoch: [0] [5600/6500] lr: 0.000043 closs: 0.7701 (0.8029) grad_norm: 0.4936 (0.6182) time: 5.5637 data: 0.0001 max mem: 71357 +[02:04:58.346233] Epoch: [0] [5610/6500] lr: 0.000043 closs: 0.7756 (0.8028) grad_norm: 0.4655 (0.6179) time: 5.5722 data: 0.0001 max mem: 71357 +[02:05:54.107394] Epoch: [0] [5620/6500] lr: 0.000043 closs: 0.7756 (0.8027) grad_norm: 0.4211 (0.6174) time: 5.5774 data: 0.0001 max mem: 71357 +[02:06:49.764956] Epoch: [0] [5630/6500] lr: 0.000043 closs: 0.7647 (0.8026) grad_norm: 0.4478 (0.6173) time: 5.5708 data: 0.0001 max mem: 71357 +[02:07:45.456020] Epoch: [0] [5640/6500] lr: 0.000043 closs: 0.7647 (0.8025) grad_norm: 0.4073 (0.6169) time: 5.5673 data: 0.0001 max mem: 71357 +[02:08:41.296090] Epoch: [0] [5650/6500] lr: 0.000043 closs: 0.7760 (0.8025) grad_norm: 0.3969 (0.6166) time: 5.5764 data: 0.0001 max mem: 71357 +[02:09:37.090246] Epoch: [0] [5660/6500] lr: 0.000044 closs: 0.7567 (0.8025) grad_norm: 0.4379 (0.6163) time: 5.5816 data: 0.0001 max mem: 71357 +[02:10:32.741323] Epoch: [0] [5670/6500] lr: 0.000044 closs: 0.7353 (0.8024) grad_norm: 0.4004 (0.6161) time: 5.5721 data: 0.0001 max mem: 71357 +[02:11:28.424749] Epoch: [0] [5680/6500] lr: 0.000044 closs: 0.8031 (0.8024) grad_norm: 0.4379 (0.6159) time: 5.5666 data: 0.0002 max mem: 71357 +[02:12:24.285486] Epoch: [0] [5690/6500] lr: 0.000044 closs: 0.7536 (0.8023) grad_norm: 0.4379 (0.6155) time: 5.5771 data: 0.0002 max mem: 71357 +[02:13:19.918884] Epoch: [0] [5700/6500] lr: 0.000044 closs: 0.6936 (0.8021) grad_norm: 0.4161 (0.6153) time: 5.5746 data: 0.0001 max mem: 71357 +[02:14:15.599778] Epoch: [0] [5710/6500] lr: 0.000044 closs: 0.6915 (0.8019) grad_norm: 0.4161 (0.6149) time: 5.5656 data: 0.0001 max mem: 71357 
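A quick consistency check on the timing column: iterations run at roughly 5.57–5.58 s each, so a 6500-step epoch should take about ten hours, matching the `Total time: 10:04:03` reported when epoch 0 finishes below.

```python
# Back-of-the-envelope check: 6500 iterations at ~5.576 s each
# is close to the epoch's reported "Total time: 10:04:03" (36243 s).
seconds = 6500 * 5.576
print(f"{seconds:.0f} s ≈ {seconds/3600:.2f} h")  # 36244 s ≈ 10.07 h
```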
+[02:15:11.331649] Epoch: [0] [5720/6500] lr: 0.000044 closs: 0.7293 (0.8019) grad_norm: 0.4047 (0.6147) time: 5.5705 data: 0.0001 max mem: 71357 +[02:16:07.152160] Epoch: [0] [5730/6500] lr: 0.000044 closs: 0.8008 (0.8020) grad_norm: 0.3836 (0.6144) time: 5.5775 data: 0.0002 max mem: 71357 +[02:17:03.066211] Epoch: [0] [5740/6500] lr: 0.000044 closs: 0.8080 (0.8020) grad_norm: 0.3720 (0.6140) time: 5.5866 data: 0.0002 max mem: 71357 +[02:17:58.785745] Epoch: [0] [5750/6500] lr: 0.000044 closs: 0.7575 (0.8018) grad_norm: 0.3820 (0.6138) time: 5.5816 data: 0.0001 max mem: 71357 +[02:18:54.476351] Epoch: [0] [5760/6500] lr: 0.000044 closs: 0.7069 (0.8017) grad_norm: 0.3731 (0.6134) time: 5.5704 data: 0.0001 max mem: 71357 +[02:19:50.250044] Epoch: [0] [5770/6500] lr: 0.000044 closs: 0.7394 (0.8016) grad_norm: 0.3789 (0.6132) time: 5.5731 data: 0.0001 max mem: 71357 +[02:20:46.124223] Epoch: [0] [5780/6500] lr: 0.000044 closs: 0.7709 (0.8015) grad_norm: 0.3789 (0.6128) time: 5.5823 data: 0.0001 max mem: 71357 +[02:21:41.877082] Epoch: [0] [5790/6500] lr: 0.000045 closs: 0.7787 (0.8014) grad_norm: 0.3731 (0.6127) time: 5.5813 data: 0.0002 max mem: 71357 +[02:22:37.579383] Epoch: [0] [5800/6500] lr: 0.000045 closs: 0.7921 (0.8014) grad_norm: 0.3789 (0.6125) time: 5.5727 data: 0.0002 max mem: 71357 +[02:23:33.334032] Epoch: [0] [5810/6500] lr: 0.000045 closs: 0.7929 (0.8014) grad_norm: 0.3504 (0.6121) time: 5.5727 data: 0.0001 max mem: 71357 +[02:24:29.023544] Epoch: [0] [5820/6500] lr: 0.000045 closs: 0.7977 (0.8014) grad_norm: 0.3907 (0.6119) time: 5.5721 data: 0.0001 max mem: 71357 +[02:25:24.861645] Epoch: [0] [5830/6500] lr: 0.000045 closs: 0.7955 (0.8014) grad_norm: 0.4042 (0.6119) time: 5.5763 data: 0.0001 max mem: 71357 +[02:26:20.652863] Epoch: [0] [5840/6500] lr: 0.000045 closs: 0.7487 (0.8014) grad_norm: 0.3819 (0.6115) time: 5.5813 data: 0.0001 max mem: 71357 +[02:27:16.301506] Epoch: [0] [5850/6500] lr: 0.000045 closs: 0.7196 (0.8012) grad_norm: 0.3847 (0.6113) time: 5.5719 data: 0.0001 max mem: 71357 +[02:28:12.058501] Epoch: [0] [5860/6500] lr: 0.000045 closs: 0.7021 (0.8011) grad_norm: 0.3812 (0.6109) time: 5.5702 data: 0.0001 max mem: 71357 +[02:29:07.841507] Epoch: [0] [5870/6500] lr: 0.000045 closs: 0.7515 (0.8010) grad_norm: 0.3859 (0.6106) time: 5.5769 data: 0.0001 max mem: 71357 +[02:30:03.483436] Epoch: [0] [5880/6500] lr: 0.000045 closs: 0.7809 (0.8010) grad_norm: 0.4351 (0.6106) time: 5.5711 data: 0.0001 max mem: 71357 +[02:30:59.287191] Epoch: [0] [5890/6500] lr: 0.000045 closs: 0.7648 (0.8009) grad_norm: 0.4266 (0.6104) time: 5.5722 data: 0.0001 max mem: 71357 +[02:31:55.068642] Epoch: [0] [5900/6500] lr: 0.000045 closs: 0.7531 (0.8008) grad_norm: 0.4351 (0.6100) time: 5.5791 data: 0.0001 max mem: 71357 +[02:32:50.950525] Epoch: [0] [5910/6500] lr: 0.000045 closs: 0.7531 (0.8007) grad_norm: 0.4113 (0.6096) time: 5.5831 data: 0.0001 max mem: 71357 +[02:33:46.661662] Epoch: [0] [5920/6500] lr: 0.000046 closs: 0.7102 (0.8005) grad_norm: 0.3950 (0.6096) time: 5.5796 data: 0.0001 max mem: 71357 +[02:34:42.406758] Epoch: [0] [5930/6500] lr: 0.000046 closs: 0.7361 (0.8005) grad_norm: 0.3965 (0.6096) time: 5.5727 data: 0.0001 max mem: 71357 +[02:35:38.231017] Epoch: [0] [5940/6500] lr: 0.000046 closs: 0.8309 (0.8006) grad_norm: 0.3901 (0.6092) time: 5.5783 data: 0.0001 max mem: 71357 +[02:36:34.027442] Epoch: [0] [5950/6500] lr: 0.000046 closs: 0.8309 (0.8005) grad_norm: 0.3965 (0.6088) time: 5.5809 data: 0.0002 max mem: 71357 +[02:37:30.000045] Epoch: [0] [5960/6500] lr: 
0.000046 closs: 0.7825 (0.8006) grad_norm: 0.3745 (0.6084) time: 5.5883 data: 0.0002 max mem: 71357 +[02:38:25.764491] Epoch: [0] [5970/6500] lr: 0.000046 closs: 0.8335 (0.8005) grad_norm: 0.3616 (0.6081) time: 5.5868 data: 0.0001 max mem: 71357 +[02:39:21.355752] Epoch: [0] [5980/6500] lr: 0.000046 closs: 0.8265 (0.8005) grad_norm: 0.3835 (0.6082) time: 5.5677 data: 0.0001 max mem: 71357 +[02:40:17.146222] Epoch: [0] [5990/6500] lr: 0.000046 closs: 0.7573 (0.8004) grad_norm: 0.3745 (0.6079) time: 5.5690 data: 0.0001 max mem: 71357 +[02:41:12.953638] Epoch: [0] [6000/6500] lr: 0.000046 closs: 0.7502 (0.8004) grad_norm: 0.3969 (0.6075) time: 5.5798 data: 0.0002 max mem: 71357 +[02:42:08.673781] Epoch: [0] [6010/6500] lr: 0.000046 closs: 0.7498 (0.8004) grad_norm: 0.3891 (0.6072) time: 5.5762 data: 0.0002 max mem: 71357 +[02:43:04.453165] Epoch: [0] [6020/6500] lr: 0.000046 closs: 0.7474 (0.8002) grad_norm: 0.3891 (0.6068) time: 5.5749 data: 0.0001 max mem: 71357 +[02:44:00.201410] Epoch: [0] [6030/6500] lr: 0.000046 closs: 0.7241 (0.8002) grad_norm: 0.3969 (0.6066) time: 5.5763 data: 0.0001 max mem: 71357 +[02:44:55.881083] Epoch: [0] [6040/6500] lr: 0.000046 closs: 0.7288 (0.8002) grad_norm: 0.4122 (0.6063) time: 5.5713 data: 0.0001 max mem: 71357 +[02:45:51.715473] Epoch: [0] [6050/6500] lr: 0.000047 closs: 0.7671 (0.8000) grad_norm: 0.4315 (0.6062) time: 5.5756 data: 0.0001 max mem: 71357 +[02:46:47.325180] Epoch: [0] [6060/6500] lr: 0.000047 closs: 0.7667 (0.7999) grad_norm: 0.4321 (0.6060) time: 5.5721 data: 0.0002 max mem: 71357 +[02:47:42.956637] Epoch: [0] [6070/6500] lr: 0.000047 closs: 0.7482 (0.7999) grad_norm: 0.4387 (0.6060) time: 5.5620 data: 0.0002 max mem: 71357 +[02:48:38.729897] Epoch: [0] [6080/6500] lr: 0.000047 closs: 0.7457 (0.7999) grad_norm: 0.4835 (0.6058) time: 5.5701 data: 0.0001 max mem: 71357 +[02:49:34.468997] Epoch: [0] [6090/6500] lr: 0.000047 closs: 0.7455 (0.7998) grad_norm: 0.5015 (0.6058) time: 5.5755 data: 0.0001 max mem: 71357 +[02:50:30.184089] Epoch: [0] [6100/6500] lr: 0.000047 closs: 0.7606 (0.7998) grad_norm: 0.5218 (0.6058) time: 5.5726 data: 0.0001 max mem: 71357 +[02:51:25.929815] Epoch: [0] [6110/6500] lr: 0.000047 closs: 0.8044 (0.7998) grad_norm: 0.4483 (0.6054) time: 5.5729 data: 0.0002 max mem: 71357 +[02:52:21.562615] Epoch: [0] [6120/6500] lr: 0.000047 closs: 0.7751 (0.7998) grad_norm: 0.4406 (0.6052) time: 5.5688 data: 0.0002 max mem: 71357 +[02:53:17.515875] Epoch: [0] [6130/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3885 (0.6049) time: 5.5792 data: 0.0001 max mem: 71357 +[02:54:13.223511] Epoch: [0] [6140/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3792 (0.6049) time: 5.5830 data: 0.0001 max mem: 71357 +[02:55:08.927662] Epoch: [0] [6150/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3861 (0.6051) time: 5.5705 data: 0.0001 max mem: 71357 +[02:56:04.619859] Epoch: [0] [6160/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3739 (0.6047) time: 5.5697 data: 0.0001 max mem: 71357 +[02:57:00.344520] Epoch: [0] [6170/6500] lr: 0.000047 closs: 0.7830 (0.7997) grad_norm: 0.3740 (0.6044) time: 5.5707 data: 0.0001 max mem: 71357 +[02:57:56.275405] Epoch: [0] [6180/6500] lr: 0.000048 closs: 0.8380 (0.7997) grad_norm: 0.3740 (0.6042) time: 5.5827 data: 0.0001 max mem: 71357 +[02:58:52.075593] Epoch: [0] [6190/6500] lr: 0.000048 closs: 0.7414 (0.7996) grad_norm: 0.3430 (0.6037) time: 5.5865 data: 0.0001 max mem: 71357 +[02:59:47.687443] Epoch: [0] [6200/6500] lr: 0.000048 closs: 0.7333 (0.7995) grad_norm: 0.3656 
(0.6035) time: 5.5705 data: 0.0001 max mem: 71357 +[03:00:43.414828] Epoch: [0] [6210/6500] lr: 0.000048 closs: 0.7467 (0.7997) grad_norm: 0.3530 (0.6032) time: 5.5669 data: 0.0001 max mem: 71357 +[03:01:39.227260] Epoch: [0] [6220/6500] lr: 0.000048 closs: 0.7170 (0.7994) grad_norm: 0.3656 (0.6029) time: 5.5769 data: 0.0001 max mem: 71357 +[03:02:35.020515] Epoch: [0] [6230/6500] lr: 0.000048 closs: 0.6600 (0.7994) grad_norm: 0.4024 (0.6027) time: 5.5802 data: 0.0001 max mem: 71357 +[03:03:30.712438] Epoch: [0] [6240/6500] lr: 0.000048 closs: 0.6600 (0.7993) grad_norm: 0.4024 (0.6024) time: 5.5742 data: 0.0001 max mem: 71357 +[03:04:26.392385] Epoch: [0] [6250/6500] lr: 0.000048 closs: 0.7017 (0.7992) grad_norm: 0.4194 (0.6022) time: 5.5685 data: 0.0001 max mem: 71357 +[03:05:22.220010] Epoch: [0] [6260/6500] lr: 0.000048 closs: 0.7189 (0.7991) grad_norm: 0.4248 (0.6019) time: 5.5753 data: 0.0001 max mem: 71357 +[03:06:18.144391] Epoch: [0] [6270/6500] lr: 0.000048 closs: 0.7910 (0.7992) grad_norm: 0.4248 (0.6016) time: 5.5875 data: 0.0002 max mem: 71357 +[03:07:13.838284] Epoch: [0] [6280/6500] lr: 0.000048 closs: 0.7742 (0.7991) grad_norm: 0.4084 (0.6013) time: 5.5808 data: 0.0002 max mem: 71357 +[03:08:09.622986] Epoch: [0] [6290/6500] lr: 0.000048 closs: 0.7474 (0.7991) grad_norm: 0.3958 (0.6010) time: 5.5738 data: 0.0001 max mem: 71357 +[03:09:05.363224] Epoch: [0] [6300/6500] lr: 0.000048 closs: 0.7197 (0.7990) grad_norm: 0.3744 (0.6008) time: 5.5761 data: 0.0001 max mem: 71357 +[03:10:01.116113] Epoch: [0] [6310/6500] lr: 0.000049 closs: 0.8256 (0.7990) grad_norm: 0.3958 (0.6005) time: 5.5745 data: 0.0001 max mem: 71357 +[03:10:56.828047] Epoch: [0] [6320/6500] lr: 0.000049 closs: 0.8374 (0.7991) grad_norm: 0.3957 (0.6003) time: 5.5731 data: 0.0001 max mem: 71357 +[03:11:52.510798] Epoch: [0] [6330/6500] lr: 0.000049 closs: 0.7012 (0.7988) grad_norm: 0.4162 (0.6001) time: 5.5696 data: 0.0002 max mem: 71357 +[03:12:48.232518] Epoch: [0] [6340/6500] lr: 0.000049 closs: 0.6872 (0.7987) grad_norm: 0.4371 (0.5999) time: 5.5701 data: 0.0002 max mem: 71357 +[03:13:43.960654] Epoch: [0] [6350/6500] lr: 0.000049 closs: 0.7740 (0.7987) grad_norm: 0.4367 (0.5996) time: 5.5724 data: 0.0001 max mem: 71357 +[03:14:39.727757] Epoch: [0] [6360/6500] lr: 0.000049 closs: 0.7274 (0.7985) grad_norm: 0.4371 (0.5994) time: 5.5747 data: 0.0001 max mem: 71357 +[03:15:35.404672] Epoch: [0] [6370/6500] lr: 0.000049 closs: 0.7655 (0.7986) grad_norm: 0.4357 (0.5992) time: 5.5721 data: 0.0001 max mem: 71357 +[03:16:31.087842] Epoch: [0] [6380/6500] lr: 0.000049 closs: 0.7822 (0.7986) grad_norm: 0.4313 (0.5989) time: 5.5679 data: 0.0002 max mem: 71357 +[03:17:26.822736] Epoch: [0] [6390/6500] lr: 0.000049 closs: 0.7659 (0.7985) grad_norm: 0.4313 (0.5989) time: 5.5708 data: 0.0002 max mem: 71357 +[03:18:22.577493] Epoch: [0] [6400/6500] lr: 0.000049 closs: 0.6885 (0.7984) grad_norm: 0.4265 (0.5989) time: 5.5744 data: 0.0001 max mem: 71357 +[03:19:18.330428] Epoch: [0] [6410/6500] lr: 0.000049 closs: 0.7096 (0.7982) grad_norm: 0.4265 (0.5987) time: 5.5753 data: 0.0001 max mem: 71357 +[03:20:14.076963] Epoch: [0] [6420/6500] lr: 0.000049 closs: 0.7532 (0.7982) grad_norm: 0.4265 (0.5984) time: 5.5749 data: 0.0001 max mem: 71357 +[03:21:09.837780] Epoch: [0] [6430/6500] lr: 0.000049 closs: 0.7365 (0.7980) grad_norm: 0.4245 (0.5982) time: 5.5753 data: 0.0001 max mem: 71357 +[03:22:05.632480] Epoch: [0] [6440/6500] lr: 0.000050 closs: 0.7185 (0.7980) grad_norm: 0.4245 (0.5980) time: 5.5777 data: 0.0001 max mem: 71357 
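Epoch 0 wraps up just below: the averaged stats are printed, the model/optimizer/rank-common/rank-specific checkpoint files are written, and PyTorch warns that `FullyShardedDataParallel.full_optim_state_dict` is deprecated in favor of `FullyShardedDataParallel.optim_state_dict`. A minimal sketch of the replacement call the warning recommends — not the repo's actual saving code; it assumes `model` is FSDP-wrapped, `optimizer` is its optimizer, and the output path is illustrative:

```python
# Minimal sketch (assumed names, not the repo's code): consolidate the
# optimizer state with the API the UserWarning below recommends.
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def save_optimizer(model, optimizer, path="consolidated.00-of-01.optimizer.pth"):
    # Deprecated form, which triggers the warning seen in the log:
    #   osd = FSDP.full_optim_state_dict(model, optimizer)
    osd = FSDP.optim_state_dict(model, optimizer)  # recommended replacement
    if dist.get_rank() == 0:  # all ranks participate; rank 0 writes the file
        torch.save(osd, path)
```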
+[03:23:01.288584] Epoch: [0] [6450/6500] lr: 0.000050 closs: 0.7666 (0.7979) grad_norm: 0.4007 (0.5978) time: 5.5724 data: 0.0001 max mem: 71357 +[03:23:56.940427] Epoch: [0] [6460/6500] lr: 0.000050 closs: 0.7144 (0.7978) grad_norm: 0.3927 (0.5975) time: 5.5653 data: 0.0001 max mem: 71357 +[03:24:52.591248] Epoch: [0] [6470/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4434 (0.5979) time: 5.5650 data: 0.0001 max mem: 71357 +[03:25:48.307940] Epoch: [0] [6480/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4148 (0.5983) time: 5.5683 data: 0.0001 max mem: 71357 +[03:26:44.126298] Epoch: [0] [6490/6500] lr: 0.000050 closs: 0.7379 (0.7977) grad_norm: 0.4968 (0.5982) time: 5.5767 data: 0.0002 max mem: 71357
+[03:27:34.661033] Epoch: [0] Total time: 10:04:03
+[03:27:34.691953] Averaged stats: lr: 0.000050 closs: 0.7927 (0.7980) grad_norm: 0.4559 (0.5980)
+/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn(
+[03:27:34.851776] model saved
+/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn(
+[03:27:35.757911] optimizer saved
+[03:27:35.758397] other rank-common saved
+[03:27:35.761691] rank-specific saved
+[03:27:35.770694] log_dir: ./output_dir
+[03:27:43.916967] Epoch: [1] [0/6500] lr: 0.000050 closs: 0.6486 (0.6486) time: 8.1455 data: 2.5052 max mem: 71357 +[03:28:39.555610] Epoch: [1] [10/6500] lr: 0.000050 closs: 0.7239 (0.7205) grad_norm: 0.3962 (0.4080) time: 5.7985 data: 0.2279 max mem: 71357 +[03:29:35.233876] Epoch: [1] [20/6500] lr: 0.000050 closs: 0.8120 (0.7878) grad_norm: 0.4027 (0.4338) time: 5.5657 data: 0.0001 max mem: 71357 +[03:30:31.042993] Epoch: [1] [30/6500] lr: 0.000050 closs: 0.8128 (0.7810) grad_norm: 0.4141 (0.4367) time: 5.5743 data: 0.0001 max mem: 71357 +[03:31:26.778182] Epoch: [1] [40/6500] lr: 0.000050 closs: 0.7600 (0.7738) grad_norm: 0.4286 (0.4379) time: 5.5771 data: 0.0001 max mem: 71357 +[03:32:22.493128] Epoch: [1] [50/6500] lr: 0.000050 closs: 0.7715 (0.7786) grad_norm: 0.4286 (0.4348) time: 5.5724 data: 0.0001 max mem: 71357 +[03:33:18.164718] Epoch: [1] [60/6500] lr: 0.000050 closs: 0.7715 (0.7727) grad_norm: 0.4141 (0.4459) time: 5.5692 data: 0.0001 max mem: 71357 +[03:34:13.800199] Epoch: [1] [70/6500] lr: 0.000050 closs: 0.7570 (0.7741) grad_norm: 0.4129 (0.4442) time: 5.5652 data: 0.0001 max mem: 71357 +[03:35:09.583280] Epoch: [1] [80/6500] lr: 0.000050 closs: 0.7335 (0.7625) grad_norm: 0.4129 (0.4435) time: 5.5708 data: 0.0001 max mem: 71357 +[03:36:05.465429] Epoch: [1] [90/6500] lr: 0.000050 closs: 0.7385 (0.7681) grad_norm: 0.4266 (0.4414) time: 5.5832 data: 0.0001 max mem: 71357 +[03:37:01.196399] Epoch: [1] [100/6500] lr: 0.000050 closs: 0.7711 (0.7741) grad_norm: 0.3775 (0.4462) time: 5.5806 data: 0.0001 max mem: 71357 +[03:37:56.957142] Epoch: [1] [110/6500] lr: 0.000050 closs: 0.7738 (0.7750) grad_norm: 0.3745 (0.4366) time: 5.5745 data: 0.0001 max mem: 71357 +[03:38:52.693091] Epoch: [1] [120/6500]
lr: 0.000050 closs: 0.7361 (0.7703) grad_norm: 0.3729 (0.4461) time: 5.5747 data: 0.0001 max mem: 71357 +[03:39:48.482751] Epoch: [1] [130/6500] lr: 0.000050 closs: 0.7085 (0.7608) grad_norm: 0.3710 (0.4497) time: 5.5762 data: 0.0001 max mem: 71357 +[03:40:44.069389] Epoch: [1] [140/6500] lr: 0.000050 closs: 0.6935 (0.7555) grad_norm: 0.3846 (0.4603) time: 5.5687 data: 0.0001 max mem: 71357 +[03:41:39.808776] Epoch: [1] [150/6500] lr: 0.000050 closs: 0.7341 (0.7603) grad_norm: 0.3978 (0.4595) time: 5.5662 data: 0.0001 max mem: 71357 +[03:42:35.446981] Epoch: [1] [160/6500] lr: 0.000050 closs: 0.8180 (0.7645) grad_norm: 0.4602 (0.4629) time: 5.5688 data: 0.0001 max mem: 71357 +[03:43:31.177187] Epoch: [1] [170/6500] lr: 0.000050 closs: 0.7881 (0.7646) grad_norm: 0.4495 (0.4614) time: 5.5683 data: 0.0001 max mem: 71357 +[03:44:26.897683] Epoch: [1] [180/6500] lr: 0.000050 closs: 0.8055 (0.7686) grad_norm: 0.4495 (0.4677) time: 5.5724 data: 0.0001 max mem: 71357 +[03:45:22.545885] Epoch: [1] [190/6500] lr: 0.000050 closs: 0.7508 (0.7662) grad_norm: 0.4817 (0.4761) time: 5.5683 data: 0.0001 max mem: 71357 +[03:46:18.218652] Epoch: [1] [200/6500] lr: 0.000050 closs: 0.7508 (0.7668) grad_norm: 0.4273 (0.4732) time: 5.5660 data: 0.0001 max mem: 71357 +[03:47:13.969467] Epoch: [1] [210/6500] lr: 0.000050 closs: 0.7564 (0.7684) grad_norm: 0.4665 (0.4759) time: 5.5711 data: 0.0001 max mem: 71357 +[03:48:09.716365] Epoch: [1] [220/6500] lr: 0.000050 closs: 0.7529 (0.7664) grad_norm: 0.4229 (0.4752) time: 5.5748 data: 0.0001 max mem: 71357 +[03:49:05.445622] Epoch: [1] [230/6500] lr: 0.000050 closs: 0.7457 (0.7659) grad_norm: 0.3906 (0.4711) time: 5.5737 data: 0.0001 max mem: 71357 +[03:50:01.108007] Epoch: [1] [240/6500] lr: 0.000050 closs: 0.7287 (0.7642) grad_norm: 0.4106 (0.4707) time: 5.5695 data: 0.0001 max mem: 71357 +[03:50:56.937160] Epoch: [1] [250/6500] lr: 0.000050 closs: 0.7201 (0.7644) grad_norm: 0.4106 (0.6624) time: 5.5745 data: 0.0001 max mem: 71357 +[03:51:52.784701] Epoch: [1] [260/6500] lr: 0.000050 closs: 0.7706 (0.7645) grad_norm: 0.4106 (0.6523) time: 5.5837 data: 0.0001 max mem: 71357 +[03:52:48.462413] Epoch: [1] [270/6500] lr: 0.000050 closs: 0.7976 (0.7668) grad_norm: 0.4267 (0.6459) time: 5.5761 data: 0.0001 max mem: 71357 +[03:53:44.119411] Epoch: [1] [280/6500] lr: 0.000050 closs: 0.7421 (0.7669) grad_norm: 0.4267 (0.6423) time: 5.5666 data: 0.0002 max mem: 71357 +[03:54:39.776151] Epoch: [1] [290/6500] lr: 0.000050 closs: 0.7414 (0.7677) grad_norm: 0.4362 (0.6379) time: 5.5656 data: 0.0002 max mem: 71357 +[03:55:35.536753] Epoch: [1] [300/6500] lr: 0.000050 closs: 0.7593 (0.7666) grad_norm: 0.5156 (0.6368) time: 5.5707 data: 0.0001 max mem: 71357 +[03:56:31.295194] Epoch: [1] [310/6500] lr: 0.000050 closs: 0.7351 (0.7641) grad_norm: 0.4603 (0.6320) time: 5.5758 data: 0.0001 max mem: 71357 +[03:57:26.899484] Epoch: [1] [320/6500] lr: 0.000050 closs: 0.7863 (0.7665) grad_norm: 0.4362 (0.6254) time: 5.5680 data: 0.0001 max mem: 71357 +[03:58:22.621113] Epoch: [1] [330/6500] lr: 0.000050 closs: 0.7725 (0.7660) grad_norm: 0.4302 (0.6213) time: 5.5662 data: 0.0002 max mem: 71357 +[03:59:18.439876] Epoch: [1] [340/6500] lr: 0.000050 closs: 0.7531 (0.7661) grad_norm: 0.3934 (0.6146) time: 5.5769 data: 0.0002 max mem: 71357 +[04:00:14.348413] Epoch: [1] [350/6500] lr: 0.000050 closs: 0.7826 (0.7659) grad_norm: 0.3847 (0.6065) time: 5.5862 data: 0.0001 max mem: 71357 +[04:01:09.980901] Epoch: [1] [360/6500] lr: 0.000050 closs: 0.8143 (0.7690) grad_norm: 0.3870 (0.6030) time: 
5.5769 data: 0.0001 max mem: 71357 +[04:02:05.648563] Epoch: [1] [370/6500] lr: 0.000050 closs: 0.7969 (0.7674) grad_norm: 0.3870 (0.6001) time: 5.5649 data: 0.0001 max mem: 71357 +[04:03:01.348936] Epoch: [1] [380/6500] lr: 0.000050 closs: 0.7290 (0.7683) grad_norm: 0.4082 (0.5945) time: 5.5683 data: 0.0001 max mem: 71357 +[04:03:57.052644] Epoch: [1] [390/6500] lr: 0.000050 closs: 0.7591 (0.7676) grad_norm: 0.4110 (0.5896) time: 5.5701 data: 0.0001 max mem: 71357 +[04:04:52.824329] Epoch: [1] [400/6500] lr: 0.000050 closs: 0.7321 (0.7674) grad_norm: 0.3754 (0.5860) time: 5.5736 data: 0.0001 max mem: 71357 +[04:05:48.442381] Epoch: [1] [410/6500] lr: 0.000050 closs: 0.7504 (0.7677) grad_norm: 0.4082 (0.5832) time: 5.5694 data: 0.0001 max mem: 71357 +[04:06:44.156813] Epoch: [1] [420/6500] lr: 0.000050 closs: 0.7621 (0.7684) grad_norm: 0.3807 (0.5788) time: 5.5665 data: 0.0001 max mem: 71357 +[04:07:39.800804] Epoch: [1] [430/6500] lr: 0.000050 closs: 0.7435 (0.7668) grad_norm: 0.4047 (0.5767) time: 5.5678 data: 0.0001 max mem: 71357 +[04:08:35.582184] Epoch: [1] [440/6500] lr: 0.000050 closs: 0.7382 (0.7663) grad_norm: 0.4047 (0.5735) time: 5.5712 data: 0.0001 max mem: 71357 +[04:09:31.256070] Epoch: [1] [450/6500] lr: 0.000050 closs: 0.7414 (0.7663) grad_norm: 0.3852 (0.5718) time: 5.5727 data: 0.0001 max mem: 71357 +[04:10:26.919187] Epoch: [1] [460/6500] lr: 0.000050 closs: 0.7273 (0.7660) grad_norm: 0.4047 (0.5691) time: 5.5667 data: 0.0001 max mem: 71357 +[04:11:22.572352] Epoch: [1] [470/6500] lr: 0.000050 closs: 0.7546 (0.7670) grad_norm: 0.4096 (0.5692) time: 5.5657 data: 0.0001 max mem: 71357 +[04:12:18.379459] Epoch: [1] [480/6500] lr: 0.000050 closs: 0.7764 (0.7670) grad_norm: 0.4096 (0.5649) time: 5.5729 data: 0.0001 max mem: 71357 +[04:13:14.102252] Epoch: [1] [490/6500] lr: 0.000050 closs: 0.7591 (0.7662) grad_norm: 0.4096 (0.5617) time: 5.5764 data: 0.0001 max mem: 71357 +[04:14:09.753488] Epoch: [1] [500/6500] lr: 0.000050 closs: 0.7008 (0.7658) grad_norm: 0.4468 (0.5610) time: 5.5686 data: 0.0001 max mem: 71357 +[04:15:05.414976] Epoch: [1] [510/6500] lr: 0.000050 closs: 0.7568 (0.7650) grad_norm: 0.3889 (0.5583) time: 5.5655 data: 0.0001 max mem: 71357 +[04:16:01.166383] Epoch: [1] [520/6500] lr: 0.000050 closs: 0.7262 (0.7643) grad_norm: 0.4508 (0.5579) time: 5.5705 data: 0.0001 max mem: 71357 +[04:16:56.919801] Epoch: [1] [530/6500] lr: 0.000050 closs: 0.6799 (0.7621) grad_norm: 0.4514 (0.5565) time: 5.5752 data: 0.0001 max mem: 71357 +[04:17:52.660053] Epoch: [1] [540/6500] lr: 0.000050 closs: 0.6093 (0.7599) grad_norm: 0.4378 (0.5536) time: 5.5746 data: 0.0001 max mem: 71357 +[04:18:48.328201] Epoch: [1] [550/6500] lr: 0.000050 closs: 0.6526 (0.7593) grad_norm: 0.4489 (0.5519) time: 5.5704 data: 0.0002 max mem: 71357 +[04:19:44.116985] Epoch: [1] [560/6500] lr: 0.000050 closs: 0.7426 (0.7588) grad_norm: 0.3843 (0.5489) time: 5.5728 data: 0.0002 max mem: 71357 +[04:20:39.824567] Epoch: [1] [570/6500] lr: 0.000050 closs: 0.7211 (0.7596) grad_norm: 0.3797 (0.5466) time: 5.5747 data: 0.0001 max mem: 71357 +[04:21:35.492317] Epoch: [1] [580/6500] lr: 0.000050 closs: 0.7623 (0.7591) grad_norm: 0.3837 (0.5454) time: 5.5687 data: 0.0001 max mem: 71357 +[04:22:31.256691] Epoch: [1] [590/6500] lr: 0.000050 closs: 0.7601 (0.7591) grad_norm: 0.3874 (0.5442) time: 5.5715 data: 0.0001 max mem: 71357 +[04:23:27.024625] Epoch: [1] [600/6500] lr: 0.000050 closs: 0.7601 (0.7599) grad_norm: 0.4280 (0.5442) time: 5.5765 data: 0.0001 max mem: 71357 +[04:24:22.754492] Epoch: [1] 
[610/6500] lr: 0.000050 closs: 0.7729 (0.7613) grad_norm: 0.4293 (0.5421) time: 5.5748 data: 0.0001 max mem: 71357 +[04:25:18.641839] Epoch: [1] [620/6500] lr: 0.000050 closs: 0.7729 (0.7617) grad_norm: 0.4281 (0.5403) time: 5.5808 data: 0.0001 max mem: 71357 +[04:26:14.504595] Epoch: [1] [630/6500] lr: 0.000050 closs: 0.7718 (0.7620) grad_norm: 0.4043 (0.5377) time: 5.5874 data: 0.0001 max mem: 71357 +[04:27:10.222475] Epoch: [1] [640/6500] lr: 0.000050 closs: 0.7718 (0.7624) grad_norm: 0.4043 (0.5370) time: 5.5789 data: 0.0001 max mem: 71357 +[04:28:05.954345] Epoch: [1] [650/6500] lr: 0.000050 closs: 0.7636 (0.7626) grad_norm: 0.3995 (0.5348) time: 5.5724 data: 0.0001 max mem: 71357 +[04:29:01.871861] Epoch: [1] [660/6500] lr: 0.000050 closs: 0.7671 (0.7637) grad_norm: 0.3870 (0.5328) time: 5.5824 data: 0.0002 max mem: 71357 +[04:29:57.593541] Epoch: [1] [670/6500] lr: 0.000050 closs: 0.7871 (0.7652) grad_norm: 0.4053 (0.5317) time: 5.5819 data: 0.0002 max mem: 71357 +[04:30:53.271954] Epoch: [1] [680/6500] lr: 0.000050 closs: 0.7538 (0.7653) grad_norm: 0.3957 (0.5294) time: 5.5699 data: 0.0001 max mem: 71357 +[04:31:49.060636] Epoch: [1] [690/6500] lr: 0.000050 closs: 0.7395 (0.7658) grad_norm: 0.3957 (0.5279) time: 5.5733 data: 0.0001 max mem: 71357 +[04:32:44.827788] Epoch: [1] [700/6500] lr: 0.000050 closs: 0.7529 (0.7662) grad_norm: 0.3957 (0.5283) time: 5.5777 data: 0.0001 max mem: 71357 +[04:33:40.570340] Epoch: [1] [710/6500] lr: 0.000050 closs: 0.7456 (0.7660) grad_norm: 0.3900 (0.5295) time: 5.5754 data: 0.0002 max mem: 71357 +[04:34:36.343032] Epoch: [1] [720/6500] lr: 0.000050 closs: 0.7456 (0.7663) grad_norm: 0.3888 (0.5283) time: 5.5756 data: 0.0002 max mem: 71357 +[04:35:32.039216] Epoch: [1] [730/6500] lr: 0.000050 closs: 0.7721 (0.7665) grad_norm: 0.4204 (0.5287) time: 5.5733 data: 0.0001 max mem: 71357 +[04:36:27.753192] Epoch: [1] [740/6500] lr: 0.000050 closs: 0.7721 (0.7673) grad_norm: 0.4113 (0.5270) time: 5.5704 data: 0.0001 max mem: 71357 +[04:37:23.585540] Epoch: [1] [750/6500] lr: 0.000050 closs: 0.7719 (0.7677) grad_norm: 0.3883 (0.5252) time: 5.5772 data: 0.0001 max mem: 71357 +[04:38:19.280542] Epoch: [1] [760/6500] lr: 0.000050 closs: 0.7400 (0.7679) grad_norm: 0.3817 (0.5245) time: 5.5763 data: 0.0001 max mem: 71357 +[04:39:14.994009] Epoch: [1] [770/6500] lr: 0.000050 closs: 0.7093 (0.7682) grad_norm: 0.3817 (0.5237) time: 5.5703 data: 0.0001 max mem: 71357 +[04:40:10.694587] Epoch: [1] [780/6500] lr: 0.000050 closs: 0.7811 (0.7686) grad_norm: 0.3984 (0.5226) time: 5.5706 data: 0.0001 max mem: 71357 +[04:41:06.527247] Epoch: [1] [790/6500] lr: 0.000050 closs: 0.7501 (0.7679) grad_norm: 0.4039 (0.5212) time: 5.5766 data: 0.0001 max mem: 71357 +[04:42:02.287497] Epoch: [1] [800/6500] lr: 0.000050 closs: 0.7254 (0.7673) grad_norm: 0.4044 (0.5203) time: 5.5795 data: 0.0001 max mem: 71357 +[04:42:57.958012] Epoch: [1] [810/6500] lr: 0.000050 closs: 0.6886 (0.7670) grad_norm: 0.4039 (0.5206) time: 5.5714 data: 0.0001 max mem: 71357 +[04:43:53.709271] Epoch: [1] [820/6500] lr: 0.000050 closs: 0.6873 (0.7668) grad_norm: 0.3714 (0.5228) time: 5.5710 data: 0.0001 max mem: 71357 +[04:44:49.456416] Epoch: [1] [830/6500] lr: 0.000050 closs: 0.7426 (0.7667) grad_norm: 0.4044 (0.5218) time: 5.5748 data: 0.0001 max mem: 71357 +[04:45:45.250753] Epoch: [1] [840/6500] lr: 0.000050 closs: 0.7789 (0.7668) grad_norm: 0.4001 (0.5215) time: 5.5770 data: 0.0001 max mem: 71357 +[04:46:40.949223] Epoch: [1] [850/6500] lr: 0.000050 closs: 0.7471 (0.7662) grad_norm: 0.3780 (0.5200) 
time: 5.5746 data: 0.0001 max mem: 71357 +[04:47:36.561157] Epoch: [1] [860/6500] lr: 0.000050 closs: 0.7247 (0.7658) grad_norm: 0.4001 (0.5197) time: 5.5654 data: 0.0001 max mem: 71357 +[04:48:32.308653] Epoch: [1] [870/6500] lr: 0.000050 closs: 0.7587 (0.7662) grad_norm: 0.4001 (0.5201) time: 5.5679 data: 0.0002 max mem: 71357 +[04:49:28.149376] Epoch: [1] [880/6500] lr: 0.000050 closs: 0.7531 (0.7667) grad_norm: 0.3780 (0.5181) time: 5.5793 data: 0.0002 max mem: 71357 +[04:50:23.950718] Epoch: [1] [890/6500] lr: 0.000050 closs: 0.7416 (0.7664) grad_norm: 0.3905 (0.5193) time: 5.5820 data: 0.0001 max mem: 71357 +[04:51:19.670794] Epoch: [1] [900/6500] lr: 0.000050 closs: 0.7291 (0.7655) grad_norm: 0.3578 (0.5184) time: 5.5760 data: 0.0001 max mem: 71357 +[04:52:15.464157] Epoch: [1] [910/6500] lr: 0.000050 closs: 0.7453 (0.7650) grad_norm: 0.3669 (0.5174) time: 5.5756 data: 0.0001 max mem: 71357 +[04:53:11.330205] Epoch: [1] [920/6500] lr: 0.000050 closs: 0.7024 (0.7651) grad_norm: 0.3843 (0.5163) time: 5.5828 data: 0.0001 max mem: 71357 +[04:54:07.010433] Epoch: [1] [930/6500] lr: 0.000050 closs: 0.7381 (0.7650) grad_norm: 0.3768 (0.5146) time: 5.5772 data: 0.0002 max mem: 71357 +[04:55:02.643380] Epoch: [1] [940/6500] lr: 0.000050 closs: 0.7571 (0.7650) grad_norm: 0.3843 (0.5142) time: 5.5656 data: 0.0002 max mem: 71357 +[04:55:58.347304] Epoch: [1] [950/6500] lr: 0.000050 closs: 0.7475 (0.7644) grad_norm: 0.3946 (0.5133) time: 5.5668 data: 0.0001 max mem: 71357 +[04:56:54.139194] Epoch: [1] [960/6500] lr: 0.000050 closs: 0.7475 (0.7651) grad_norm: 0.3654 (0.5122) time: 5.5747 data: 0.0001 max mem: 71357 +[04:57:50.039666] Epoch: [1] [970/6500] lr: 0.000050 closs: 0.7587 (0.7649) grad_norm: 0.3891 (0.5117) time: 5.5845 data: 0.0001 max mem: 71357 +[04:58:45.709094] Epoch: [1] [980/6500] lr: 0.000050 closs: 0.7098 (0.7645) grad_norm: 0.3654 (0.5107) time: 5.5784 data: 0.0001 max mem: 71357 +[04:59:41.325212] Epoch: [1] [990/6500] lr: 0.000050 closs: 0.7216 (0.7645) grad_norm: 0.3654 (0.5096) time: 5.5642 data: 0.0001 max mem: 71357 +[05:00:36.996785] Epoch: [1] [1000/6500] lr: 0.000050 closs: 0.7500 (0.7646) grad_norm: 0.3880 (0.5092) time: 5.5643 data: 0.0001 max mem: 71357 +[05:01:32.817052] Epoch: [1] [1010/6500] lr: 0.000050 closs: 0.7316 (0.7644) grad_norm: 0.3880 (0.5102) time: 5.5745 data: 0.0001 max mem: 71357 +[05:02:28.530120] Epoch: [1] [1020/6500] lr: 0.000050 closs: 0.7286 (0.7641) grad_norm: 0.4530 (0.5110) time: 5.5766 data: 0.0001 max mem: 71357 +[05:03:24.118826] Epoch: [1] [1030/6500] lr: 0.000050 closs: 0.7875 (0.7641) grad_norm: 0.4651 (0.5107) time: 5.5650 data: 0.0001 max mem: 71357 +[05:04:19.728286] Epoch: [1] [1040/6500] lr: 0.000050 closs: 0.7858 (0.7640) grad_norm: 0.5125 (0.5102) time: 5.5598 data: 0.0001 max mem: 71357 +[05:05:15.444436] Epoch: [1] [1050/6500] lr: 0.000050 closs: 0.7511 (0.7631) grad_norm: 0.4293 (0.5093) time: 5.5662 data: 0.0001 max mem: 71357 +[05:06:11.258329] Epoch: [1] [1060/6500] lr: 0.000050 closs: 0.6617 (0.7621) grad_norm: 0.4393 (0.5098) time: 5.5764 data: 0.0001 max mem: 71357 +[05:07:06.968795] Epoch: [1] [1070/6500] lr: 0.000050 closs: 0.6714 (0.7617) grad_norm: 0.4393 (0.5090) time: 5.5761 data: 0.0001 max mem: 71357 +[05:08:02.608905] Epoch: [1] [1080/6500] lr: 0.000050 closs: 0.6980 (0.7620) grad_norm: 0.4393 (0.5091) time: 5.5675 data: 0.0001 max mem: 71357 +[05:08:58.333156] Epoch: [1] [1090/6500] lr: 0.000050 closs: 0.7979 (0.7625) grad_norm: 0.4444 (0.5083) time: 5.5681 data: 0.0002 max mem: 71357 +[05:09:54.158949] 
Epoch: [1] [1100/6500] lr: 0.000050 closs: 0.7641 (0.7622) grad_norm: 0.4184 (0.5078) time: 5.5774 data: 0.0002 max mem: 71357 +[05:10:49.934142] Epoch: [1] [1110/6500] lr: 0.000050 closs: 0.7720 (0.7626) grad_norm: 0.4444 (0.5076) time: 5.5800 data: 0.0001 max mem: 71357 +[05:11:45.606270] Epoch: [1] [1120/6500] lr: 0.000050 closs: 0.7850 (0.7630) grad_norm: 0.4184 (0.5068) time: 5.5723 data: 0.0001 max mem: 71357 +[05:12:41.469541] Epoch: [1] [1130/6500] lr: 0.000050 closs: 0.7606 (0.7631) grad_norm: 0.4232 (0.5069) time: 5.5767 data: 0.0001 max mem: 71357 +[05:13:37.226239] Epoch: [1] [1140/6500] lr: 0.000050 closs: 0.7306 (0.7627) grad_norm: 0.4617 (0.5069) time: 5.5809 data: 0.0002 max mem: 71357 +[05:14:32.916965] Epoch: [1] [1150/6500] lr: 0.000050 closs: 0.7464 (0.7632) grad_norm: 0.4232 (0.5061) time: 5.5723 data: 0.0002 max mem: 71357 +[05:15:28.589173] Epoch: [1] [1160/6500] lr: 0.000050 closs: 0.8070 (0.7637) grad_norm: 0.4654 (0.5071) time: 5.5680 data: 0.0001 max mem: 71357 +[05:16:24.252719] Epoch: [1] [1170/6500] lr: 0.000050 closs: 0.7713 (0.7635) grad_norm: 0.4364 (0.5061) time: 5.5667 data: 0.0001 max mem: 71357 +[05:17:19.928322] Epoch: [1] [1180/6500] lr: 0.000050 closs: 0.6577 (0.7623) grad_norm: 0.4311 (0.5059) time: 5.5669 data: 0.0001 max mem: 71357 +[05:18:15.678514] Epoch: [1] [1190/6500] lr: 0.000050 closs: 0.6944 (0.7620) grad_norm: 0.4312 (0.5049) time: 5.5712 data: 0.0001 max mem: 71357 +[05:19:11.287718] Epoch: [1] [1200/6500] lr: 0.000050 closs: 0.7023 (0.7611) grad_norm: 0.4168 (0.5048) time: 5.5679 data: 0.0001 max mem: 71357 +[05:20:06.965210] Epoch: [1] [1210/6500] lr: 0.000050 closs: 0.7447 (0.7617) grad_norm: 0.4043 (0.5037) time: 5.5642 data: 0.0001 max mem: 71357 +[05:21:02.647004] Epoch: [1] [1220/6500] lr: 0.000050 closs: 0.8533 (0.7631) grad_norm: 0.4043 (0.5091) time: 5.5678 data: 0.0001 max mem: 71357 +[05:21:58.423989] Epoch: [1] [1230/6500] lr: 0.000050 closs: 0.8409 (0.7629) grad_norm: 0.4043 (0.5136) time: 5.5728 data: 0.0001 max mem: 71357 +[05:22:54.043709] Epoch: [1] [1240/6500] lr: 0.000050 closs: 0.7011 (0.7626) grad_norm: 0.4162 (0.5134) time: 5.5698 data: 0.0001 max mem: 71357 +[05:23:49.814775] Epoch: [1] [1250/6500] lr: 0.000050 closs: 0.7408 (0.7626) grad_norm: 0.4162 (0.5124) time: 5.5695 data: 0.0001 max mem: 71357 +[05:24:45.428712] Epoch: [1] [1260/6500] lr: 0.000050 closs: 0.7448 (0.7624) grad_norm: 0.4091 (0.5116) time: 5.5691 data: 0.0001 max mem: 71357 +[05:25:41.203614] Epoch: [1] [1270/6500] lr: 0.000050 closs: 0.7227 (0.7624) grad_norm: 0.4091 (0.5118) time: 5.5693 data: 0.0001 max mem: 71357 +[05:26:37.014189] Epoch: [1] [1280/6500] lr: 0.000050 closs: 0.7570 (0.7625) grad_norm: 0.4033 (0.5109) time: 5.5792 data: 0.0001 max mem: 71357 +[05:27:32.726225] Epoch: [1] [1290/6500] lr: 0.000050 closs: 0.7570 (0.7623) grad_norm: 0.4306 (0.5116) time: 5.5760 data: 0.0001 max mem: 71357 +[05:28:28.581411] Epoch: [1] [1300/6500] lr: 0.000050 closs: 0.7812 (0.7627) grad_norm: 0.4184 (0.5103) time: 5.5783 data: 0.0001 max mem: 71357 +[05:29:24.296988] Epoch: [1] [1310/6500] lr: 0.000050 closs: 0.7503 (0.7624) grad_norm: 0.4029 (0.5096) time: 5.5785 data: 0.0001 max mem: 71357 +[05:30:20.071193] Epoch: [1] [1320/6500] lr: 0.000049 closs: 0.7246 (0.7624) grad_norm: 0.4029 (0.5091) time: 5.5744 data: 0.0001 max mem: 71357 +[05:31:15.692148] Epoch: [1] [1330/6500] lr: 0.000049 closs: 0.7118 (0.7625) grad_norm: 0.3548 (0.5083) time: 5.5697 data: 0.0001 max mem: 71357 +[05:32:11.408467] Epoch: [1] [1340/6500] lr: 0.000049 closs: 0.7109 
(0.7621) grad_norm: 0.3788 (0.5074) time: 5.5667 data: 0.0001 max mem: 71357 +[05:33:07.091654] Epoch: [1] [1350/6500] lr: 0.000049 closs: 0.7109 (0.7621) grad_norm: 0.3984 (0.5070) time: 5.5699 data: 0.0001 max mem: 71357 +[05:34:02.874850] Epoch: [1] [1360/6500] lr: 0.000049 closs: 0.7131 (0.7629) grad_norm: 0.3984 (0.5061) time: 5.5732 data: 0.0001 max mem: 71357 +[05:34:58.543548] Epoch: [1] [1370/6500] lr: 0.000049 closs: 0.7400 (0.7626) grad_norm: 0.3952 (0.5052) time: 5.5725 data: 0.0001 max mem: 71357 +[05:35:54.208343] Epoch: [1] [1380/6500] lr: 0.000049 closs: 0.7499 (0.7630) grad_norm: 0.3952 (0.5043) time: 5.5666 data: 0.0001 max mem: 71357 +[05:36:49.991759] Epoch: [1] [1390/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3701 (0.5033) time: 5.5723 data: 0.0001 max mem: 71357 +[05:37:45.788654] Epoch: [1] [1400/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3727 (0.5032) time: 5.5789 data: 0.0001 max mem: 71357 +[05:38:41.516767] Epoch: [1] [1410/6500] lr: 0.000049 closs: 0.8001 (0.7629) grad_norm: 0.3895 (0.5029) time: 5.5761 data: 0.0001 max mem: 71357 +[05:39:37.272883] Epoch: [1] [1420/6500] lr: 0.000049 closs: 0.8019 (0.7634) grad_norm: 0.3895 (0.5022) time: 5.5741 data: 0.0001 max mem: 71357 +[05:40:32.946983] Epoch: [1] [1430/6500] lr: 0.000049 closs: 0.7728 (0.7634) grad_norm: 0.4048 (0.5015) time: 5.5714 data: 0.0001 max mem: 71357 +[05:41:28.638478] Epoch: [1] [1440/6500] lr: 0.000049 closs: 0.6953 (0.7633) grad_norm: 0.4000 (0.5010) time: 5.5682 data: 0.0001 max mem: 71357 +[05:42:24.408111] Epoch: [1] [1450/6500] lr: 0.000049 closs: 0.7377 (0.7630) grad_norm: 0.4000 (0.5005) time: 5.5729 data: 0.0001 max mem: 71357 +[05:43:20.139266] Epoch: [1] [1460/6500] lr: 0.000049 closs: 0.7676 (0.7636) grad_norm: 0.4100 (0.5000) time: 5.5749 data: 0.0001 max mem: 71357 +[05:44:15.852405] Epoch: [1] [1470/6500] lr: 0.000049 closs: 0.8084 (0.7637) grad_norm: 0.4047 (0.4994) time: 5.5721 data: 0.0001 max mem: 71357 +[05:45:11.499402] Epoch: [1] [1480/6500] lr: 0.000049 closs: 0.7897 (0.7633) grad_norm: 0.4047 (0.4993) time: 5.5679 data: 0.0001 max mem: 71357 +[05:46:07.251883] Epoch: [1] [1490/6500] lr: 0.000049 closs: 0.7280 (0.7636) grad_norm: 0.3998 (0.4990) time: 5.5699 data: 0.0001 max mem: 71357 +[05:47:03.095336] Epoch: [1] [1500/6500] lr: 0.000049 closs: 0.7501 (0.7636) grad_norm: 0.3807 (0.4985) time: 5.5797 data: 0.0001 max mem: 71357 +[05:47:58.748278] Epoch: [1] [1510/6500] lr: 0.000049 closs: 0.7772 (0.7638) grad_norm: 0.3998 (0.4986) time: 5.5747 data: 0.0001 max mem: 71357 +[05:48:54.481750] Epoch: [1] [1520/6500] lr: 0.000049 closs: 0.7816 (0.7640) grad_norm: 0.3807 (0.4983) time: 5.5692 data: 0.0002 max mem: 71357 +[05:49:50.180751] Epoch: [1] [1530/6500] lr: 0.000049 closs: 0.7623 (0.7641) grad_norm: 0.3790 (0.4979) time: 5.5715 data: 0.0002 max mem: 71357 +[05:50:45.971116] Epoch: [1] [1540/6500] lr: 0.000049 closs: 0.8359 (0.7645) grad_norm: 0.3935 (0.4972) time: 5.5743 data: 0.0001 max mem: 71357 +[05:51:41.648187] Epoch: [1] [1550/6500] lr: 0.000049 closs: 0.7772 (0.7649) grad_norm: 0.3974 (0.4970) time: 5.5733 data: 0.0001 max mem: 71357 +[05:52:37.329020] Epoch: [1] [1560/6500] lr: 0.000049 closs: 0.7288 (0.7646) grad_norm: 0.4314 (0.4972) time: 5.5678 data: 0.0001 max mem: 71357 +[05:53:33.021624] Epoch: [1] [1570/6500] lr: 0.000049 closs: 0.7566 (0.7647) grad_norm: 0.4203 (0.4978) time: 5.5686 data: 0.0001 max mem: 71357 +[05:54:28.849243] Epoch: [1] [1580/6500] lr: 0.000049 closs: 0.7969 (0.7649) grad_norm: 0.4503 (0.4972) time: 5.5759 
data: 0.0001 max mem: 71357 +[05:55:24.622675] Epoch: [1] [1590/6500] lr: 0.000049 closs: 0.7610 (0.7645) grad_norm: 0.4014 (0.4963) time: 5.5800 data: 0.0001 max mem: 71357 +[05:56:20.321965] Epoch: [1] [1600/6500] lr: 0.000049 closs: 0.7300 (0.7644) grad_norm: 0.3717 (0.4956) time: 5.5735 data: 0.0001 max mem: 71357 +[05:57:16.028460] Epoch: [1] [1610/6500] lr: 0.000049 closs: 0.7300 (0.7642) grad_norm: 0.3510 (0.4955) time: 5.5702 data: 0.0001 max mem: 71357 +[05:58:11.783706] Epoch: [1] [1620/6500] lr: 0.000049 closs: 0.7731 (0.7643) grad_norm: 0.3510 (0.4947) time: 5.5730 data: 0.0001 max mem: 71357 +[05:59:07.570790] Epoch: [1] [1630/6500] lr: 0.000049 closs: 0.8139 (0.7647) grad_norm: 0.4063 (0.4945) time: 5.5770 data: 0.0001 max mem: 71357 +[06:00:03.238802] Epoch: [1] [1640/6500] lr: 0.000049 closs: 0.8139 (0.7646) grad_norm: 0.4153 (0.4940) time: 5.5727 data: 0.0001 max mem: 71357 +[06:00:58.977098] Epoch: [1] [1650/6500] lr: 0.000049 closs: 0.7872 (0.7648) grad_norm: 0.4063 (0.4936) time: 5.5702 data: 0.0001 max mem: 71357 +[06:01:54.788329] Epoch: [1] [1660/6500] lr: 0.000049 closs: 0.7872 (0.7650) grad_norm: 0.4311 (0.4933) time: 5.5774 data: 0.0001 max mem: 71357 +[06:02:50.593146] Epoch: [1] [1670/6500] lr: 0.000049 closs: 0.7356 (0.7648) grad_norm: 0.4235 (0.4928) time: 5.5807 data: 0.0001 max mem: 71357 +[06:03:46.216024] Epoch: [1] [1680/6500] lr: 0.000049 closs: 0.7429 (0.7648) grad_norm: 0.4235 (0.4927) time: 5.5713 data: 0.0002 max mem: 71357 +[06:04:41.979289] Epoch: [1] [1690/6500] lr: 0.000049 closs: 0.7711 (0.7647) grad_norm: 0.4235 (0.4929) time: 5.5692 data: 0.0002 max mem: 71357 +[06:05:37.689767] Epoch: [1] [1700/6500] lr: 0.000049 closs: 0.7552 (0.7647) grad_norm: 0.4142 (0.4924) time: 5.5736 data: 0.0001 max mem: 71357 +[06:06:33.493941] Epoch: [1] [1710/6500] lr: 0.000049 closs: 0.6966 (0.7640) grad_norm: 0.3993 (0.4918) time: 5.5756 data: 0.0001 max mem: 71357 +[06:07:29.339116] Epoch: [1] [1720/6500] lr: 0.000049 closs: 0.6799 (0.7640) grad_norm: 0.3993 (0.4916) time: 5.5824 data: 0.0001 max mem: 71357 +[06:08:25.102712] Epoch: [1] [1730/6500] lr: 0.000049 closs: 0.6988 (0.7640) grad_norm: 0.3986 (0.4910) time: 5.5803 data: 0.0001 max mem: 71357 +[06:09:20.832118] Epoch: [1] [1740/6500] lr: 0.000049 closs: 0.6702 (0.7637) grad_norm: 0.3993 (0.4907) time: 5.5746 data: 0.0001 max mem: 71357 +[06:10:16.553919] Epoch: [1] [1750/6500] lr: 0.000049 closs: 0.6695 (0.7633) grad_norm: 0.4026 (0.4905) time: 5.5725 data: 0.0001 max mem: 71357 +[06:11:12.343083] Epoch: [1] [1760/6500] lr: 0.000049 closs: 0.7046 (0.7632) grad_norm: 0.3957 (0.4898) time: 5.5755 data: 0.0001 max mem: 71357 +[06:12:08.075303] Epoch: [1] [1770/6500] lr: 0.000049 closs: 0.7100 (0.7634) grad_norm: 0.3782 (0.4889) time: 5.5760 data: 0.0001 max mem: 71357 +[06:13:03.741874] Epoch: [1] [1780/6500] lr: 0.000049 closs: 0.7462 (0.7635) grad_norm: 0.3682 (0.4882) time: 5.5698 data: 0.0001 max mem: 71357 +[06:13:59.467131] Epoch: [1] [1790/6500] lr: 0.000049 closs: 0.7743 (0.7633) grad_norm: 0.3682 (0.4882) time: 5.5695 data: 0.0002 max mem: 71357 +[06:14:55.302278] Epoch: [1] [1800/6500] lr: 0.000049 closs: 0.7203 (0.7629) grad_norm: 0.3880 (0.4885) time: 5.5779 data: 0.0002 max mem: 71357 +[06:15:51.032856] Epoch: [1] [1810/6500] lr: 0.000049 closs: 0.7254 (0.7630) grad_norm: 0.4342 (0.4885) time: 5.5782 data: 0.0001 max mem: 71357 +[06:16:46.808428] Epoch: [1] [1820/6500] lr: 0.000049 closs: 0.7463 (0.7628) grad_norm: 0.4342 (0.4879) time: 5.5752 data: 0.0001 max mem: 71357 +[06:17:42.566412] 
Epoch: [1] [1830/6500] lr: 0.000049 closs: 0.7301 (0.7628) grad_norm: 0.4255 (0.4880) time: 5.5766 data: 0.0001 max mem: 71357 +[06:18:38.312144] Epoch: [1] [1840/6500] lr: 0.000049 closs: 0.8045 (0.7632) grad_norm: 0.3756 (0.4880) time: 5.5751 data: 0.0001 max mem: 71357 +[06:19:34.202602] Epoch: [1] [1850/6500] lr: 0.000049 closs: 0.7885 (0.7632) grad_norm: 0.3538 (0.4873) time: 5.5817 data: 0.0001 max mem: 71357 +[06:20:29.987714] Epoch: [1] [1860/6500] lr: 0.000049 closs: 0.7457 (0.7629) grad_norm: 0.3515 (0.4866) time: 5.5837 data: 0.0001 max mem: 71357 +[06:21:25.707631] Epoch: [1] [1870/6500] lr: 0.000049 closs: 0.7457 (0.7628) grad_norm: 0.3717 (0.4872) time: 5.5752 data: 0.0001 max mem: 71357 +[06:22:21.508496] Epoch: [1] [1880/6500] lr: 0.000049 closs: 0.7003 (0.7626) grad_norm: 0.3717 (0.4877) time: 5.5759 data: 0.0001 max mem: 71357 +[06:23:17.323008] Epoch: [1] [1890/6500] lr: 0.000049 closs: 0.7003 (0.7624) grad_norm: 0.3850 (0.4873) time: 5.5806 data: 0.0001 max mem: 71357 +[06:24:13.089745] Epoch: [1] [1900/6500] lr: 0.000049 closs: 0.7211 (0.7625) grad_norm: 0.4018 (0.4866) time: 5.5790 data: 0.0001 max mem: 71357 +[06:25:08.735665] Epoch: [1] [1910/6500] lr: 0.000049 closs: 0.7094 (0.7621) grad_norm: 0.4018 (0.4866) time: 5.5705 data: 0.0001 max mem: 71357 +[06:26:04.431078] Epoch: [1] [1920/6500] lr: 0.000049 closs: 0.6812 (0.7620) grad_norm: 0.4018 (0.4865) time: 5.5669 data: 0.0001 max mem: 71357 +[06:27:00.205152] Epoch: [1] [1930/6500] lr: 0.000049 closs: 0.7006 (0.7619) grad_norm: 0.4018 (0.4860) time: 5.5734 data: 0.0001 max mem: 71357 +[06:27:56.086508] Epoch: [1] [1940/6500] lr: 0.000049 closs: 0.7977 (0.7623) grad_norm: 0.3875 (0.4854) time: 5.5827 data: 0.0001 max mem: 71357 +[06:28:51.724118] Epoch: [1] [1950/6500] lr: 0.000049 closs: 0.7787 (0.7624) grad_norm: 0.3773 (0.4852) time: 5.5759 data: 0.0001 max mem: 71357 +[06:29:47.487192] Epoch: [1] [1960/6500] lr: 0.000049 closs: 0.7331 (0.7623) grad_norm: 0.3773 (0.4848) time: 5.5699 data: 0.0001 max mem: 71357 +[06:30:43.261286] Epoch: [1] [1970/6500] lr: 0.000049 closs: 0.7788 (0.7628) grad_norm: 0.3829 (0.4843) time: 5.5768 data: 0.0001 max mem: 71357 +[06:31:38.991880] Epoch: [1] [1980/6500] lr: 0.000049 closs: 0.7698 (0.7625) grad_norm: 0.3889 (0.4838) time: 5.5751 data: 0.0001 max mem: 71357 +[06:32:34.722420] Epoch: [1] [1990/6500] lr: 0.000049 closs: 0.7517 (0.7626) grad_norm: 0.3977 (0.4837) time: 5.5730 data: 0.0001 max mem: 71357 +[06:33:30.426934] Epoch: [1] [2000/6500] lr: 0.000049 closs: 0.7535 (0.7625) grad_norm: 0.3977 (0.4837) time: 5.5717 data: 0.0001 max mem: 71357 +[06:34:26.157107] Epoch: [1] [2010/6500] lr: 0.000049 closs: 0.7385 (0.7624) grad_norm: 0.3737 (0.4831) time: 5.5717 data: 0.0001 max mem: 71357 +[06:35:21.974735] Epoch: [1] [2020/6500] lr: 0.000049 closs: 0.7578 (0.7625) grad_norm: 0.4042 (0.4834) time: 5.5773 data: 0.0001 max mem: 71357 +[06:36:17.629504] Epoch: [1] [2030/6500] lr: 0.000049 closs: 0.8125 (0.7628) grad_norm: 0.3830 (0.4834) time: 5.5735 data: 0.0001 max mem: 71357 +[06:37:13.410785] Epoch: [1] [2040/6500] lr: 0.000049 closs: 0.8006 (0.7627) grad_norm: 0.3633 (0.4827) time: 5.5717 data: 0.0001 max mem: 71357 +[06:38:09.181666] Epoch: [1] [2050/6500] lr: 0.000049 closs: 0.7954 (0.7626) grad_norm: 0.3844 (0.4823) time: 5.5775 data: 0.0001 max mem: 71357 +[06:39:04.900937] Epoch: [1] [2060/6500] lr: 0.000049 closs: 0.7756 (0.7627) grad_norm: 0.3844 (0.4822) time: 5.5744 data: 0.0001 max mem: 71357 +[06:40:00.725173] Epoch: [1] [2070/6500] lr: 0.000049 closs: 0.7406 
(0.7625) grad_norm: 0.3707 (0.4816) time: 5.5771 data: 0.0001 max mem: 71357 +[06:40:56.513573] Epoch: [1] [2080/6500] lr: 0.000049 closs: 0.7405 (0.7626) grad_norm: 0.4113 (0.4810) time: 5.5806 data: 0.0001 max mem: 71357 +[06:41:52.319453] Epoch: [1] [2090/6500] lr: 0.000049 closs: 0.7236 (0.7623) grad_norm: 0.3818 (0.4805) time: 5.5796 data: 0.0001 max mem: 71357 +[06:42:48.076246] Epoch: [1] [2100/6500] lr: 0.000049 closs: 0.6560 (0.7622) grad_norm: 0.3610 (0.4800) time: 5.5781 data: 0.0001 max mem: 71357 +[06:43:43.875280] Epoch: [1] [2110/6500] lr: 0.000049 closs: 0.6950 (0.7623) grad_norm: 0.3698 (0.4796) time: 5.5777 data: 0.0001 max mem: 71357 +[06:44:39.638406] Epoch: [1] [2120/6500] lr: 0.000049 closs: 0.7491 (0.7621) grad_norm: 0.3982 (0.4794) time: 5.5780 data: 0.0001 max mem: 71357 +[06:45:35.343891] Epoch: [1] [2130/6500] lr: 0.000049 closs: 0.7413 (0.7621) grad_norm: 0.4028 (0.4792) time: 5.5733 data: 0.0001 max mem: 71357 +[06:46:31.017460] Epoch: [1] [2140/6500] lr: 0.000049 closs: 0.7413 (0.7618) grad_norm: 0.4288 (0.4796) time: 5.5688 data: 0.0001 max mem: 71357 +[06:47:26.831774] Epoch: [1] [2150/6500] lr: 0.000049 closs: 0.7122 (0.7617) grad_norm: 0.4262 (0.4791) time: 5.5743 data: 0.0001 max mem: 71357 +[06:48:22.566097] Epoch: [1] [2160/6500] lr: 0.000049 closs: 0.7195 (0.7616) grad_norm: 0.3987 (0.4787) time: 5.5774 data: 0.0001 max mem: 71357 +[06:49:18.229337] Epoch: [1] [2170/6500] lr: 0.000049 closs: 0.7297 (0.7615) grad_norm: 0.3768 (0.4784) time: 5.5698 data: 0.0001 max mem: 71357 +[06:50:13.979986] Epoch: [1] [2180/6500] lr: 0.000049 closs: 0.7356 (0.7615) grad_norm: 0.3603 (0.4779) time: 5.5706 data: 0.0001 max mem: 71357 +[06:51:09.697275] Epoch: [1] [2190/6500] lr: 0.000049 closs: 0.7507 (0.7619) grad_norm: 0.3768 (0.4775) time: 5.5733 data: 0.0001 max mem: 71357 +[06:52:05.550399] Epoch: [1] [2200/6500] lr: 0.000049 closs: 0.6988 (0.7616) grad_norm: 0.3747 (0.4779) time: 5.5784 data: 0.0001 max mem: 71357 +[06:53:01.293904] Epoch: [1] [2210/6500] lr: 0.000049 closs: 0.6853 (0.7614) grad_norm: 0.3885 (0.4777) time: 5.5797 data: 0.0001 max mem: 71357 +[06:53:56.969787] Epoch: [1] [2220/6500] lr: 0.000049 closs: 0.7125 (0.7612) grad_norm: 0.4290 (0.4780) time: 5.5709 data: 0.0002 max mem: 71357 +[06:54:52.673780] Epoch: [1] [2230/6500] lr: 0.000049 closs: 0.7342 (0.7614) grad_norm: 0.4588 (0.4781) time: 5.5689 data: 0.0002 max mem: 71357 +[06:55:48.376439] Epoch: [1] [2240/6500] lr: 0.000049 closs: 0.8235 (0.7617) grad_norm: 0.4588 (0.4780) time: 5.5702 data: 0.0001 max mem: 71357 +[06:56:44.026518] Epoch: [1] [2250/6500] lr: 0.000049 closs: 0.7678 (0.7616) grad_norm: 0.4054 (0.4783) time: 5.5675 data: 0.0001 max mem: 71357 +[06:57:39.613302] Epoch: [1] [2260/6500] lr: 0.000049 closs: 0.7678 (0.7619) grad_norm: 0.4079 (0.4790) time: 5.5618 data: 0.0001 max mem: 71357 +[06:58:35.230225] Epoch: [1] [2270/6500] lr: 0.000049 closs: 0.8237 (0.7621) grad_norm: 0.4522 (0.4794) time: 5.5601 data: 0.0001 max mem: 71357 +[06:59:30.944187] Epoch: [1] [2280/6500] lr: 0.000048 closs: 0.7225 (0.7620) grad_norm: 0.4522 (0.4798) time: 5.5664 data: 0.0001 max mem: 71357 +[07:00:26.770134] Epoch: [1] [2290/6500] lr: 0.000048 closs: 0.7159 (0.7618) grad_norm: 0.4822 (0.4797) time: 5.5769 data: 0.0001 max mem: 71357 +[07:01:22.490132] Epoch: [1] [2300/6500] lr: 0.000048 closs: 0.7169 (0.7616) grad_norm: 0.4143 (0.4794) time: 5.5772 data: 0.0001 max mem: 71357 +[07:02:18.117158] Epoch: [1] [2310/6500] lr: 0.000048 closs: 0.7632 (0.7617) grad_norm: 0.3907 (0.4794) time: 5.5673 
data: 0.0001 max mem: 71357 +[07:03:13.846062] Epoch: [1] [2320/6500] lr: 0.000048 closs: 0.7486 (0.7616) grad_norm: 0.3819 (0.4791) time: 5.5677 data: 0.0001 max mem: 71357 +[07:04:09.641871] Epoch: [1] [2330/6500] lr: 0.000048 closs: 0.7391 (0.7615) grad_norm: 0.4143 (0.4791) time: 5.5762 data: 0.0001 max mem: 71357 +[07:05:05.451518] Epoch: [1] [2340/6500] lr: 0.000048 closs: 0.7884 (0.7618) grad_norm: 0.3943 (0.4788) time: 5.5802 data: 0.0001 max mem: 71357 +[07:06:01.130149] Epoch: [1] [2350/6500] lr: 0.000048 closs: 0.7346 (0.7614) grad_norm: 0.4021 (0.4786) time: 5.5743 data: 0.0001 max mem: 71357 +[07:06:56.845866] Epoch: [1] [2360/6500] lr: 0.000048 closs: 0.7227 (0.7618) grad_norm: 0.4021 (0.4786) time: 5.5696 data: 0.0001 max mem: 71357 +[07:07:52.563701] Epoch: [1] [2370/6500] lr: 0.000048 closs: 0.8105 (0.7620) grad_norm: 0.3804 (0.4782) time: 5.5716 data: 0.0001 max mem: 71357 +[07:08:48.395607] Epoch: [1] [2380/6500] lr: 0.000048 closs: 0.7941 (0.7621) grad_norm: 0.4021 (0.4783) time: 5.5774 data: 0.0001 max mem: 71357 +[07:09:44.149078] Epoch: [1] [2390/6500] lr: 0.000048 closs: 0.7885 (0.7619) grad_norm: 0.3905 (0.4779) time: 5.5792 data: 0.0001 max mem: 71357 +[07:10:39.820251] Epoch: [1] [2400/6500] lr: 0.000048 closs: 0.6213 (0.7615) grad_norm: 0.4045 (0.4780) time: 5.5712 data: 0.0001 max mem: 71357 +[07:11:35.486229] Epoch: [1] [2410/6500] lr: 0.000048 closs: 0.6479 (0.7614) grad_norm: 0.3905 (0.4782) time: 5.5668 data: 0.0001 max mem: 71357 +[07:12:31.320418] Epoch: [1] [2420/6500] lr: 0.000048 closs: 0.7154 (0.7611) grad_norm: 0.3799 (0.4777) time: 5.5749 data: 0.0001 max mem: 71357 +[07:13:27.010058] Epoch: [1] [2430/6500] lr: 0.000048 closs: 0.7631 (0.7612) grad_norm: 0.3917 (0.4774) time: 5.5761 data: 0.0001 max mem: 71357 +[07:14:22.706755] Epoch: [1] [2440/6500] lr: 0.000048 closs: 0.7958 (0.7615) grad_norm: 0.3609 (0.4771) time: 5.5692 data: 0.0001 max mem: 71357 +[07:15:18.472441] Epoch: [1] [2450/6500] lr: 0.000048 closs: 0.7683 (0.7613) grad_norm: 0.3452 (0.4766) time: 5.5730 data: 0.0001 max mem: 71357 +[07:16:14.358302] Epoch: [1] [2460/6500] lr: 0.000048 closs: 0.7683 (0.7612) grad_norm: 0.3917 (0.4763) time: 5.5825 data: 0.0001 max mem: 71357 +[07:17:09.997614] Epoch: [1] [2470/6500] lr: 0.000048 closs: 0.8213 (0.7615) grad_norm: 0.3609 (0.4760) time: 5.5761 data: 0.0001 max mem: 71357 +[07:18:05.674747] Epoch: [1] [2480/6500] lr: 0.000048 closs: 0.7941 (0.7614) grad_norm: 0.4185 (0.4761) time: 5.5657 data: 0.0001 max mem: 71357 +[07:19:01.392136] Epoch: [1] [2490/6500] lr: 0.000048 closs: 0.7147 (0.7614) grad_norm: 0.4099 (0.4756) time: 5.5696 data: 0.0002 max mem: 71357 +[07:19:57.106876] Epoch: [1] [2500/6500] lr: 0.000048 closs: 0.7326 (0.7611) grad_norm: 0.3720 (0.4753) time: 5.5715 data: 0.0002 max mem: 71357 +[07:20:52.874861] Epoch: [1] [2510/6500] lr: 0.000048 closs: 0.6783 (0.7609) grad_norm: 0.3720 (0.4748) time: 5.5740 data: 0.0001 max mem: 71357 +[07:21:48.643566] Epoch: [1] [2520/6500] lr: 0.000048 closs: 0.6970 (0.7610) grad_norm: 0.3627 (0.4746) time: 5.5767 data: 0.0001 max mem: 71357 +[07:22:44.328112] Epoch: [1] [2530/6500] lr: 0.000048 closs: 0.7447 (0.7608) grad_norm: 0.3790 (0.4745) time: 5.5726 data: 0.0001 max mem: 71357 +[07:23:40.104620] Epoch: [1] [2540/6500] lr: 0.000048 closs: 0.7475 (0.7607) grad_norm: 0.3792 (0.4743) time: 5.5730 data: 0.0001 max mem: 71357 +[07:24:35.986148] Epoch: [1] [2550/6500] lr: 0.000048 closs: 0.8081 (0.7609) grad_norm: 0.3877 (0.4740) time: 5.5828 data: 0.0001 max mem: 71357 +[07:25:31.762648] 
Epoch: [1] [2560/6500] lr: 0.000048 closs: 0.8194 (0.7610) grad_norm: 0.4054 (0.4738) time: 5.5828 data: 0.0001 max mem: 71357 +[07:26:27.517291] Epoch: [1] [2570/6500] lr: 0.000048 closs: 0.8143 (0.7612) grad_norm: 0.4320 (0.4736) time: 5.5765 data: 0.0001 max mem: 71357 +[07:27:23.081161] Epoch: [1] [2580/6500] lr: 0.000048 closs: 0.8143 (0.7614) grad_norm: 0.4400 (0.4736) time: 5.5658 data: 0.0001 max mem: 71357 +[07:28:18.847361] Epoch: [1] [2590/6500] lr: 0.000048 closs: 0.7732 (0.7615) grad_norm: 0.4291 (0.4731) time: 5.5664 data: 0.0001 max mem: 71357 +[07:29:14.631754] Epoch: [1] [2600/6500] lr: 0.000048 closs: 0.7687 (0.7615) grad_norm: 0.4291 (0.4730) time: 5.5774 data: 0.0001 max mem: 71357 +[07:30:10.313627] Epoch: [1] [2610/6500] lr: 0.000048 closs: 0.7136 (0.7613) grad_norm: 0.4208 (0.4730) time: 5.5732 data: 0.0001 max mem: 71357 +[07:31:06.069722] Epoch: [1] [2620/6500] lr: 0.000048 closs: 0.7577 (0.7615) grad_norm: 0.3758 (0.4727) time: 5.5718 data: 0.0001 max mem: 71357 +[07:32:01.904851] Epoch: [1] [2630/6500] lr: 0.000048 closs: 0.7331 (0.7612) grad_norm: 0.3758 (0.4724) time: 5.5795 data: 0.0001 max mem: 71357 +[07:32:57.687644] Epoch: [1] [2640/6500] lr: 0.000048 closs: 0.7558 (0.7615) grad_norm: 0.3758 (0.4724) time: 5.5808 data: 0.0001 max mem: 71357 +[07:33:53.438636] Epoch: [1] [2650/6500] lr: 0.000048 closs: 0.7639 (0.7615) grad_norm: 0.3693 (0.4720) time: 5.5765 data: 0.0001 max mem: 71357 +[07:34:49.158394] Epoch: [1] [2660/6500] lr: 0.000048 closs: 0.7320 (0.7611) grad_norm: 0.3779 (0.4722) time: 5.5734 data: 0.0001 max mem: 71357 +[07:35:44.955768] Epoch: [1] [2670/6500] lr: 0.000048 closs: 0.7320 (0.7612) grad_norm: 0.3779 (0.4717) time: 5.5758 data: 0.0001 max mem: 71357 +[07:36:40.729888] Epoch: [1] [2680/6500] lr: 0.000048 closs: 0.7680 (0.7613) grad_norm: 0.3804 (0.4715) time: 5.5785 data: 0.0001 max mem: 71357 +[07:37:36.512085] Epoch: [1] [2690/6500] lr: 0.000048 closs: 0.7978 (0.7615) grad_norm: 0.4166 (0.4714) time: 5.5777 data: 0.0001 max mem: 71357 +[07:38:32.202375] Epoch: [1] [2700/6500] lr: 0.000048 closs: 0.7734 (0.7615) grad_norm: 0.3963 (0.4712) time: 5.5735 data: 0.0001 max mem: 71357 +[07:39:27.865502] Epoch: [1] [2710/6500] lr: 0.000048 closs: 0.7563 (0.7617) grad_norm: 0.4106 (0.4714) time: 5.5675 data: 0.0002 max mem: 71357 +[07:40:23.614087] Epoch: [1] [2720/6500] lr: 0.000048 closs: 0.8125 (0.7619) grad_norm: 0.3954 (0.4711) time: 5.5705 data: 0.0002 max mem: 71357 +[07:41:19.483972] Epoch: [1] [2730/6500] lr: 0.000048 closs: 0.8166 (0.7619) grad_norm: 0.3848 (0.4707) time: 5.5808 data: 0.0001 max mem: 71357 +[07:42:15.165152] Epoch: [1] [2740/6500] lr: 0.000048 closs: 0.7304 (0.7619) grad_norm: 0.3909 (0.4707) time: 5.5775 data: 0.0001 max mem: 71357 +[07:43:10.903891] Epoch: [1] [2750/6500] lr: 0.000048 closs: 0.7304 (0.7617) grad_norm: 0.3943 (0.4704) time: 5.5709 data: 0.0001 max mem: 71357 +[07:44:06.612623] Epoch: [1] [2760/6500] lr: 0.000048 closs: 0.6582 (0.7613) grad_norm: 0.3752 (0.4701) time: 5.5723 data: 0.0002 max mem: 71357 +[07:45:02.480213] Epoch: [1] [2770/6500] lr: 0.000048 closs: 0.6582 (0.7612) grad_norm: 0.3752 (0.4696) time: 5.5787 data: 0.0002 max mem: 71357 +[07:45:58.196941] Epoch: [1] [2780/6500] lr: 0.000048 closs: 0.7057 (0.7612) grad_norm: 0.3494 (0.4695) time: 5.5791 data: 0.0001 max mem: 71357 +[07:46:53.877620] Epoch: [1] [2790/6500] lr: 0.000048 closs: 0.7393 (0.7613) grad_norm: 0.3485 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 +[07:47:49.527855] Epoch: [1] [2800/6500] lr: 0.000048 closs: 0.7912 
(0.7615) grad_norm: 0.3532 (0.4696) time: 5.5664 data: 0.0001 max mem: 71357 +[07:48:45.322368] Epoch: [1] [2810/6500] lr: 0.000048 closs: 0.7889 (0.7616) grad_norm: 0.3971 (0.4696) time: 5.5721 data: 0.0001 max mem: 71357 +[07:49:41.244305] Epoch: [1] [2820/6500] lr: 0.000048 closs: 0.7473 (0.7618) grad_norm: 0.4032 (0.4694) time: 5.5857 data: 0.0002 max mem: 71357 +[07:50:37.037384] Epoch: [1] [2830/6500] lr: 0.000048 closs: 0.7493 (0.7616) grad_norm: 0.3973 (0.4690) time: 5.5857 data: 0.0002 max mem: 71357 +[07:51:32.889723] Epoch: [1] [2840/6500] lr: 0.000048 closs: 0.7518 (0.7616) grad_norm: 0.3720 (0.4686) time: 5.5822 data: 0.0001 max mem: 71357 +[07:52:28.686394] Epoch: [1] [2850/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3720 (0.4684) time: 5.5823 data: 0.0001 max mem: 71357 +[07:53:24.563749] Epoch: [1] [2860/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3721 (0.4692) time: 5.5836 data: 0.0001 max mem: 71357 +[07:54:20.254650] Epoch: [1] [2870/6500] lr: 0.000048 closs: 0.7785 (0.7615) grad_norm: 0.3818 (0.4690) time: 5.5783 data: 0.0002 max mem: 71357 +[07:55:15.940546] Epoch: [1] [2880/6500] lr: 0.000048 closs: 0.7861 (0.7616) grad_norm: 0.3890 (0.4688) time: 5.5687 data: 0.0002 max mem: 71357 +[07:56:11.671960] Epoch: [1] [2890/6500] lr: 0.000048 closs: 0.7440 (0.7616) grad_norm: 0.4069 (0.4688) time: 5.5708 data: 0.0001 max mem: 71357 +[07:57:07.459023] Epoch: [1] [2900/6500] lr: 0.000048 closs: 0.7440 (0.7617) grad_norm: 0.3818 (0.4685) time: 5.5759 data: 0.0001 max mem: 71357 +[07:58:03.203864] Epoch: [1] [2910/6500] lr: 0.000048 closs: 0.7739 (0.7616) grad_norm: 0.3697 (0.4684) time: 5.5765 data: 0.0001 max mem: 71357 +[07:58:58.923850] Epoch: [1] [2920/6500] lr: 0.000048 closs: 0.7745 (0.7616) grad_norm: 0.3614 (0.4680) time: 5.5731 data: 0.0001 max mem: 71357 +[07:59:54.665057] Epoch: [1] [2930/6500] lr: 0.000048 closs: 0.7605 (0.7615) grad_norm: 0.3855 (0.4680) time: 5.5729 data: 0.0001 max mem: 71357 +[08:00:50.398913] Epoch: [1] [2940/6500] lr: 0.000048 closs: 0.6680 (0.7614) grad_norm: 0.3870 (0.4680) time: 5.5737 data: 0.0001 max mem: 71357 +[08:01:46.151660] Epoch: [1] [2950/6500] lr: 0.000048 closs: 0.6983 (0.7613) grad_norm: 0.4192 (0.4678) time: 5.5743 data: 0.0001 max mem: 71357 +[08:02:41.874089] Epoch: [1] [2960/6500] lr: 0.000047 closs: 0.7362 (0.7612) grad_norm: 0.4303 (0.4679) time: 5.5737 data: 0.0001 max mem: 71357 +[08:03:37.531424] Epoch: [1] [2970/6500] lr: 0.000047 closs: 0.7607 (0.7614) grad_norm: 0.4208 (0.4677) time: 5.5689 data: 0.0001 max mem: 71357 +[08:04:33.308851] Epoch: [1] [2980/6500] lr: 0.000047 closs: 0.8016 (0.7617) grad_norm: 0.4293 (0.4677) time: 5.5716 data: 0.0002 max mem: 71357 +[08:05:29.081119] Epoch: [1] [2990/6500] lr: 0.000047 closs: 0.8433 (0.7620) grad_norm: 0.4644 (0.4678) time: 5.5774 data: 0.0002 max mem: 71357 +[08:06:24.770784] Epoch: [1] [3000/6500] lr: 0.000047 closs: 0.8140 (0.7621) grad_norm: 0.4623 (0.4678) time: 5.5730 data: 0.0001 max mem: 71357 +[08:07:20.456994] Epoch: [1] [3010/6500] lr: 0.000047 closs: 0.7919 (0.7624) grad_norm: 0.4056 (0.4676) time: 5.5687 data: 0.0001 max mem: 71357 +[08:08:16.206729] Epoch: [1] [3020/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4676) time: 5.5717 data: 0.0001 max mem: 71357 +[08:09:11.944448] Epoch: [1] [3030/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4674) time: 5.5743 data: 0.0002 max mem: 71357 +[08:10:07.833770] Epoch: [1] [3040/6500] lr: 0.000047 closs: 0.7765 (0.7625) grad_norm: 0.3748 (0.4672) time: 5.5812 
data: 0.0002 max mem: 71357 +[08:11:03.571764] Epoch: [1] [3050/6500] lr: 0.000047 closs: 0.7737 (0.7626) grad_norm: 0.4118 (0.4674) time: 5.5812 data: 0.0001 max mem: 71357 +[08:11:59.228717] Epoch: [1] [3060/6500] lr: 0.000047 closs: 0.7259 (0.7627) grad_norm: 0.4142 (0.4672) time: 5.5697 data: 0.0001 max mem: 71357 +[08:12:54.997556] Epoch: [1] [3070/6500] lr: 0.000047 closs: 0.7579 (0.7627) grad_norm: 0.4118 (0.4670) time: 5.5712 data: 0.0001 max mem: 71357 +[08:13:50.769938] Epoch: [1] [3080/6500] lr: 0.000047 closs: 0.7579 (0.7629) grad_norm: 0.4118 (0.4668) time: 5.5770 data: 0.0001 max mem: 71357 +[08:14:46.551482] Epoch: [1] [3090/6500] lr: 0.000047 closs: 0.8032 (0.7630) grad_norm: 0.3768 (0.4667) time: 5.5776 data: 0.0001 max mem: 71357 +[08:15:42.253189] Epoch: [1] [3100/6500] lr: 0.000047 closs: 0.7503 (0.7631) grad_norm: 0.3593 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 +[08:16:37.935608] Epoch: [1] [3110/6500] lr: 0.000047 closs: 0.7480 (0.7632) grad_norm: 0.3968 (0.4666) time: 5.5691 data: 0.0001 max mem: 71357 +[08:17:33.663599] Epoch: [1] [3120/6500] lr: 0.000047 closs: 0.7927 (0.7635) grad_norm: 0.4407 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 +[08:18:29.345056] Epoch: [1] [3130/6500] lr: 0.000047 closs: 0.6941 (0.7632) grad_norm: 0.3968 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 +[08:19:24.975538] Epoch: [1] [3140/6500] lr: 0.000047 closs: 0.6547 (0.7629) grad_norm: 0.4024 (0.4667) time: 5.5655 data: 0.0002 max mem: 71357 +[08:20:20.672689] Epoch: [1] [3150/6500] lr: 0.000047 closs: 0.7241 (0.7629) grad_norm: 0.4054 (0.4671) time: 5.5663 data: 0.0002 max mem: 71357 +[08:21:16.505928] Epoch: [1] [3160/6500] lr: 0.000047 closs: 0.7440 (0.7630) grad_norm: 0.3850 (0.4667) time: 5.5764 data: 0.0001 max mem: 71357 +[08:22:12.375786] Epoch: [1] [3170/6500] lr: 0.000047 closs: 0.8047 (0.7632) grad_norm: 0.4024 (0.4664) time: 5.5850 data: 0.0001 max mem: 71357 +[08:23:08.092208] Epoch: [1] [3180/6500] lr: 0.000047 closs: 0.7895 (0.7631) grad_norm: 0.4011 (0.4664) time: 5.5792 data: 0.0001 max mem: 71357 +[08:24:03.827649] Epoch: [1] [3190/6500] lr: 0.000047 closs: 0.7448 (0.7630) grad_norm: 0.4006 (0.4662) time: 5.5725 data: 0.0001 max mem: 71357 +[08:24:59.532202] Epoch: [1] [3200/6500] lr: 0.000047 closs: 0.7496 (0.7631) grad_norm: 0.4011 (0.4661) time: 5.5719 data: 0.0001 max mem: 71357 +[08:25:55.397211] Epoch: [1] [3210/6500] lr: 0.000047 closs: 0.7620 (0.7631) grad_norm: 0.4011 (0.4659) time: 5.5784 data: 0.0001 max mem: 71357 +[08:26:51.117745] Epoch: [1] [3220/6500] lr: 0.000047 closs: 0.7429 (0.7630) grad_norm: 0.4174 (0.4658) time: 5.5791 data: 0.0001 max mem: 71357 +[08:27:46.826662] Epoch: [1] [3230/6500] lr: 0.000047 closs: 0.6742 (0.7628) grad_norm: 0.4086 (0.4658) time: 5.5713 data: 0.0001 max mem: 71357 +[08:28:42.685834] Epoch: [1] [3240/6500] lr: 0.000047 closs: 0.6707 (0.7625) grad_norm: 0.4086 (0.4656) time: 5.5783 data: 0.0001 max mem: 71357 +[08:29:38.511456] Epoch: [1] [3250/6500] lr: 0.000047 closs: 0.6844 (0.7624) grad_norm: 0.3959 (0.4653) time: 5.5842 data: 0.0001 max mem: 71357 +[08:30:34.409341] Epoch: [1] [3260/6500] lr: 0.000047 closs: 0.7046 (0.7623) grad_norm: 0.3587 (0.4650) time: 5.5861 data: 0.0001 max mem: 71357 +[08:31:30.133837] Epoch: [1] [3270/6500] lr: 0.000047 closs: 0.7094 (0.7624) grad_norm: 0.3697 (0.4648) time: 5.5810 data: 0.0001 max mem: 71357 +[08:32:25.764744] Epoch: [1] [3280/6500] lr: 0.000047 closs: 0.8080 (0.7624) grad_norm: 0.3697 (0.4645) time: 5.5677 data: 0.0001 max mem: 71357 +[08:33:21.575850] 
Epoch: [1] [3290/6500] lr: 0.000047 closs: 0.7767 (0.7625) grad_norm: 0.3703 (0.4643) time: 5.5720 data: 0.0001 max mem: 71357 +[08:34:17.542015] Epoch: [1] [3300/6500] lr: 0.000047 closs: 0.7996 (0.7626) grad_norm: 0.3703 (0.4639) time: 5.5888 data: 0.0001 max mem: 71357 +[08:35:13.335207] Epoch: [1] [3310/6500] lr: 0.000047 closs: 0.8321 (0.7627) grad_norm: 0.3927 (0.4637) time: 5.5879 data: 0.0001 max mem: 71357 +[08:36:09.139010] Epoch: [1] [3320/6500] lr: 0.000047 closs: 0.8484 (0.7629) grad_norm: 0.3927 (0.4638) time: 5.5797 data: 0.0001 max mem: 71357 +[08:37:04.835975] Epoch: [1] [3330/6500] lr: 0.000047 closs: 0.7901 (0.7628) grad_norm: 0.4108 (0.4642) time: 5.5749 data: 0.0001 max mem: 71357 +[08:38:00.607556] Epoch: [1] [3340/6500] lr: 0.000047 closs: 0.7757 (0.7629) grad_norm: 0.4291 (0.4640) time: 5.5733 data: 0.0001 max mem: 71357 +[08:38:56.311132] Epoch: [1] [3350/6500] lr: 0.000047 closs: 0.7757 (0.7630) grad_norm: 0.4346 (0.4640) time: 5.5737 data: 0.0001 max mem: 71357 +[08:39:52.026895] Epoch: [1] [3360/6500] lr: 0.000047 closs: 0.7453 (0.7628) grad_norm: 0.4087 (0.4638) time: 5.5709 data: 0.0001 max mem: 71357 +[08:40:47.717482] Epoch: [1] [3370/6500] lr: 0.000047 closs: 0.7354 (0.7627) grad_norm: 0.3643 (0.4664) time: 5.5702 data: 0.0001 max mem: 71357 +[08:41:43.520242] Epoch: [1] [3380/6500] lr: 0.000047 closs: 0.7594 (0.7628) grad_norm: 0.4010 (0.4665) time: 5.5745 data: 0.0001 max mem: 71357 +[08:42:39.268585] Epoch: [1] [3390/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.4010 (0.4664) time: 5.5775 data: 0.0001 max mem: 71357 +[08:43:35.001439] Epoch: [1] [3400/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.3704 (0.4664) time: 5.5740 data: 0.0001 max mem: 71357 +[08:44:30.743501] Epoch: [1] [3410/6500] lr: 0.000047 closs: 0.7684 (0.7630) grad_norm: 0.3704 (0.4662) time: 5.5737 data: 0.0002 max mem: 71357 +[08:45:26.393462] Epoch: [1] [3420/6500] lr: 0.000047 closs: 0.7215 (0.7627) grad_norm: 0.3796 (0.4673) time: 5.5695 data: 0.0002 max mem: 71357 +[08:46:22.210157] Epoch: [1] [3430/6500] lr: 0.000047 closs: 0.6823 (0.7628) grad_norm: 0.4108 (0.4672) time: 5.5732 data: 0.0001 max mem: 71357 +[08:47:17.932445] Epoch: [1] [3440/6500] lr: 0.000047 closs: 0.7888 (0.7629) grad_norm: 0.4108 (0.4673) time: 5.5768 data: 0.0001 max mem: 71357 +[08:48:13.684592] Epoch: [1] [3450/6500] lr: 0.000047 closs: 0.7483 (0.7628) grad_norm: 0.4411 (0.4673) time: 5.5736 data: 0.0001 max mem: 71357 +[08:49:09.395292] Epoch: [1] [3460/6500] lr: 0.000047 closs: 0.7008 (0.7626) grad_norm: 0.4411 (0.4672) time: 5.5731 data: 0.0001 max mem: 71357 +[08:50:05.184879] Epoch: [1] [3470/6500] lr: 0.000047 closs: 0.6853 (0.7625) grad_norm: 0.4585 (0.4672) time: 5.5749 data: 0.0001 max mem: 71357 +[08:51:01.025092] Epoch: [1] [3480/6500] lr: 0.000047 closs: 0.7112 (0.7624) grad_norm: 0.4481 (0.4671) time: 5.5814 data: 0.0001 max mem: 71357 +[08:51:56.784945] Epoch: [1] [3490/6500] lr: 0.000047 closs: 0.7441 (0.7625) grad_norm: 0.4319 (0.4668) time: 5.5799 data: 0.0001 max mem: 71357 +[08:52:52.516000] Epoch: [1] [3500/6500] lr: 0.000047 closs: 0.7127 (0.7623) grad_norm: 0.4192 (0.4667) time: 5.5744 data: 0.0001 max mem: 71357 +[08:53:48.323634] Epoch: [1] [3510/6500] lr: 0.000046 closs: 0.6960 (0.7622) grad_norm: 0.3713 (0.4664) time: 5.5768 data: 0.0001 max mem: 71357 +[08:54:44.172207] Epoch: [1] [3520/6500] lr: 0.000046 closs: 0.7526 (0.7620) grad_norm: 0.3674 (0.4662) time: 5.5827 data: 0.0001 max mem: 71357 +[08:55:39.982733] Epoch: [1] [3530/6500] lr: 0.000046 closs: 0.7364 
(0.7620) grad_norm: 0.3693 (0.4663) time: 5.5829 data: 0.0001 max mem: 71357 +[08:56:35.769447] Epoch: [1] [3540/6500] lr: 0.000046 closs: 0.7020 (0.7619) grad_norm: 0.3710 (0.4665) time: 5.5798 data: 0.0001 max mem: 71357 +[08:57:31.531558] Epoch: [1] [3550/6500] lr: 0.000046 closs: 0.7080 (0.7618) grad_norm: 0.3710 (0.4661) time: 5.5773 data: 0.0001 max mem: 71357 +[08:58:27.330798] Epoch: [1] [3560/6500] lr: 0.000046 closs: 0.7660 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5780 data: 0.0001 max mem: 71357 +[08:59:23.008683] Epoch: [1] [3570/6500] lr: 0.000046 closs: 0.7951 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5738 data: 0.0002 max mem: 71357 +[09:00:18.802643] Epoch: [1] [3580/6500] lr: 0.000046 closs: 0.8291 (0.7622) grad_norm: 0.4014 (0.4662) time: 5.5735 data: 0.0002 max mem: 71357 +[09:01:14.550611] Epoch: [1] [3590/6500] lr: 0.000046 closs: 0.8291 (0.7623) grad_norm: 0.4089 (0.4660) time: 5.5770 data: 0.0001 max mem: 71357 +[09:02:10.271773] Epoch: [1] [3600/6500] lr: 0.000046 closs: 0.7837 (0.7624) grad_norm: 0.4011 (0.4662) time: 5.5734 data: 0.0001 max mem: 71357 +[09:03:06.083973] Epoch: [1] [3610/6500] lr: 0.000046 closs: 0.7482 (0.7622) grad_norm: 0.3935 (0.4660) time: 5.5765 data: 0.0001 max mem: 71357 +[09:04:01.836078] Epoch: [1] [3620/6500] lr: 0.000046 closs: 0.6927 (0.7623) grad_norm: 0.3726 (0.4658) time: 5.5781 data: 0.0001 max mem: 71357 +[09:04:57.478678] Epoch: [1] [3630/6500] lr: 0.000046 closs: 0.6874 (0.7621) grad_norm: 0.3935 (0.4657) time: 5.5697 data: 0.0001 max mem: 71357 +[09:05:53.278864] Epoch: [1] [3640/6500] lr: 0.000046 closs: 0.7235 (0.7621) grad_norm: 0.3875 (0.4655) time: 5.5721 data: 0.0001 max mem: 71357 +[09:06:48.995421] Epoch: [1] [3650/6500] lr: 0.000046 closs: 0.7066 (0.7619) grad_norm: 0.3875 (0.4654) time: 5.5757 data: 0.0001 max mem: 71357 +[09:07:44.777569] Epoch: [1] [3660/6500] lr: 0.000046 closs: 0.6981 (0.7619) grad_norm: 0.3875 (0.4651) time: 5.5748 data: 0.0001 max mem: 71357 +[09:08:40.434566] Epoch: [1] [3670/6500] lr: 0.000046 closs: 0.7989 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5718 data: 0.0001 max mem: 71357 +[09:09:36.095219] Epoch: [1] [3680/6500] lr: 0.000046 closs: 0.7827 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5658 data: 0.0002 max mem: 71357 +[09:10:31.882262] Epoch: [1] [3690/6500] lr: 0.000046 closs: 0.7264 (0.7618) grad_norm: 0.3847 (0.4648) time: 5.5723 data: 0.0002 max mem: 71357 +[09:11:27.755027] Epoch: [1] [3700/6500] lr: 0.000046 closs: 0.7427 (0.7620) grad_norm: 0.3847 (0.4646) time: 5.5829 data: 0.0001 max mem: 71357 +[09:12:23.464314] Epoch: [1] [3710/6500] lr: 0.000046 closs: 0.7427 (0.7619) grad_norm: 0.3666 (0.4645) time: 5.5790 data: 0.0001 max mem: 71357 +[09:13:19.156115] Epoch: [1] [3720/6500] lr: 0.000046 closs: 0.6591 (0.7618) grad_norm: 0.3587 (0.4642) time: 5.5700 data: 0.0001 max mem: 71357 +[09:14:14.876345] Epoch: [1] [3730/6500] lr: 0.000046 closs: 0.6957 (0.7617) grad_norm: 0.3575 (0.4639) time: 5.5705 data: 0.0001 max mem: 71357 +[09:15:10.762487] Epoch: [1] [3740/6500] lr: 0.000046 closs: 0.7069 (0.7615) grad_norm: 0.3504 (0.4636) time: 5.5802 data: 0.0001 max mem: 71357 +[09:16:06.591376] Epoch: [1] [3750/6500] lr: 0.000046 closs: 0.7085 (0.7616) grad_norm: 0.3295 (0.4632) time: 5.5856 data: 0.0001 max mem: 71357 +[09:17:02.377046] Epoch: [1] [3760/6500] lr: 0.000046 closs: 0.8005 (0.7617) grad_norm: 0.3338 (0.4631) time: 5.5806 data: 0.0001 max mem: 71357 +[09:17:58.234451] Epoch: [1] [3770/6500] lr: 0.000046 closs: 0.7918 (0.7618) grad_norm: 0.3461 (0.4631) time: 5.5821 
data: 0.0001 max mem: 71357 +[09:18:54.023163] Epoch: [1] [3780/6500] lr: 0.000046 closs: 0.7503 (0.7619) grad_norm: 0.3461 (0.4628) time: 5.5822 data: 0.0001 max mem: 71357 +[09:19:49.736021] Epoch: [1] [3790/6500] lr: 0.000046 closs: 0.7219 (0.7617) grad_norm: 0.3838 (0.4628) time: 5.5750 data: 0.0002 max mem: 71357 +[09:20:45.437282] Epoch: [1] [3800/6500] lr: 0.000046 closs: 0.7728 (0.7621) grad_norm: 0.3838 (0.4625) time: 5.5706 data: 0.0002 max mem: 71357 +[09:21:41.168373] Epoch: [1] [3810/6500] lr: 0.000046 closs: 0.8062 (0.7621) grad_norm: 0.3776 (0.4627) time: 5.5715 data: 0.0001 max mem: 71357 +[09:22:36.975415] Epoch: [1] [3820/6500] lr: 0.000046 closs: 0.7432 (0.7622) grad_norm: 0.3782 (0.4624) time: 5.5768 data: 0.0001 max mem: 71357 +[09:23:32.914290] Epoch: [1] [3830/6500] lr: 0.000046 closs: 0.7504 (0.7620) grad_norm: 0.3533 (0.4622) time: 5.5872 data: 0.0001 max mem: 71357 +[09:24:28.594812] Epoch: [1] [3840/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3782 (0.4622) time: 5.5809 data: 0.0002 max mem: 71357 +[09:25:24.364082] Epoch: [1] [3850/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3533 (0.4620) time: 5.5724 data: 0.0002 max mem: 71357 +[09:26:20.100663] Epoch: [1] [3860/6500] lr: 0.000046 closs: 0.7267 (0.7616) grad_norm: 0.3744 (0.4618) time: 5.5752 data: 0.0001 max mem: 71357 +[09:27:15.928339] Epoch: [1] [3870/6500] lr: 0.000046 closs: 0.7195 (0.7616) grad_norm: 0.3995 (0.4619) time: 5.5781 data: 0.0001 max mem: 71357 +[09:28:11.670865] Epoch: [1] [3880/6500] lr: 0.000046 closs: 0.7310 (0.7615) grad_norm: 0.3729 (0.4618) time: 5.5784 data: 0.0001 max mem: 71357 +[09:29:07.413787] Epoch: [1] [3890/6500] lr: 0.000046 closs: 0.7109 (0.7613) grad_norm: 0.3729 (0.4616) time: 5.5742 data: 0.0001 max mem: 71357 +[09:30:03.241491] Epoch: [1] [3900/6500] lr: 0.000046 closs: 0.7159 (0.7613) grad_norm: 0.3739 (0.4617) time: 5.5784 data: 0.0001 max mem: 71357 +[09:30:59.019159] Epoch: [1] [3910/6500] lr: 0.000046 closs: 0.7093 (0.7612) grad_norm: 0.3819 (0.4618) time: 5.5802 data: 0.0001 max mem: 71357 +[09:31:54.797486] Epoch: [1] [3920/6500] lr: 0.000046 closs: 0.6981 (0.7612) grad_norm: 0.4453 (0.4617) time: 5.5777 data: 0.0001 max mem: 71357 +[09:32:50.515822] Epoch: [1] [3930/6500] lr: 0.000046 closs: 0.7891 (0.7612) grad_norm: 0.4498 (0.4617) time: 5.5747 data: 0.0001 max mem: 71357 +[09:33:46.285993] Epoch: [1] [3940/6500] lr: 0.000046 closs: 0.8399 (0.7613) grad_norm: 0.4643 (0.4619) time: 5.5743 data: 0.0001 max mem: 71357 +[09:34:42.060196] Epoch: [1] [3950/6500] lr: 0.000046 closs: 0.7596 (0.7614) grad_norm: 0.4053 (0.4616) time: 5.5771 data: 0.0002 max mem: 71357 +[09:35:37.892560] Epoch: [1] [3960/6500] lr: 0.000046 closs: 0.7154 (0.7614) grad_norm: 0.4053 (0.4618) time: 5.5802 data: 0.0002 max mem: 71357 +[09:36:33.555038] Epoch: [1] [3970/6500] lr: 0.000046 closs: 0.7729 (0.7614) grad_norm: 0.3912 (0.4616) time: 5.5746 data: 0.0001 max mem: 71357 +[09:37:29.253407] Epoch: [1] [3980/6500] lr: 0.000046 closs: 0.7599 (0.7612) grad_norm: 0.3912 (0.4615) time: 5.5679 data: 0.0001 max mem: 71357 +[09:38:25.028340] Epoch: [1] [3990/6500] lr: 0.000046 closs: 0.7088 (0.7613) grad_norm: 0.3892 (0.4614) time: 5.5735 data: 0.0001 max mem: 71357 +[09:39:20.898508] Epoch: [1] [4000/6500] lr: 0.000045 closs: 0.7781 (0.7614) grad_norm: 0.3892 (0.4614) time: 5.5821 data: 0.0001 max mem: 71357 +[09:40:16.589778] Epoch: [1] [4010/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5780 data: 0.0001 max mem: 71357 +[09:41:12.368366] 
Epoch: [1] [4020/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5734 data: 0.0001 max mem: 71357 +[09:42:08.075783] Epoch: [1] [4030/6500] lr: 0.000045 closs: 0.7410 (0.7613) grad_norm: 0.3698 (0.4610) time: 5.5742 data: 0.0001 max mem: 71357 +[09:43:03.790892] Epoch: [1] [4040/6500] lr: 0.000045 closs: 0.7635 (0.7613) grad_norm: 0.3481 (0.4610) time: 5.5711 data: 0.0001 max mem: 71357 +[09:43:59.611304] Epoch: [1] [4050/6500] lr: 0.000045 closs: 0.7848 (0.7613) grad_norm: 0.3679 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 +[09:44:55.346989] Epoch: [1] [4060/6500] lr: 0.000045 closs: 0.8193 (0.7616) grad_norm: 0.3679 (0.4607) time: 5.5777 data: 0.0002 max mem: 71357 +[09:45:51.144585] Epoch: [1] [4070/6500] lr: 0.000045 closs: 0.8203 (0.7615) grad_norm: 0.3481 (0.4605) time: 5.5765 data: 0.0002 max mem: 71357 +[09:46:46.827827] Epoch: [1] [4080/6500] lr: 0.000045 closs: 0.7796 (0.7615) grad_norm: 0.3621 (0.4605) time: 5.5739 data: 0.0001 max mem: 71357 +[09:47:42.580540] Epoch: [1] [4090/6500] lr: 0.000045 closs: 0.7967 (0.7616) grad_norm: 0.3748 (0.4606) time: 5.5716 data: 0.0001 max mem: 71357 +[09:48:38.297486] Epoch: [1] [4100/6500] lr: 0.000045 closs: 0.7621 (0.7619) grad_norm: 0.4406 (0.4609) time: 5.5733 data: 0.0001 max mem: 71357 +[09:49:33.991922] Epoch: [1] [4110/6500] lr: 0.000045 closs: 0.7621 (0.7618) grad_norm: 0.5227 (0.4609) time: 5.5705 data: 0.0002 max mem: 71357 +[09:50:29.739373] Epoch: [1] [4120/6500] lr: 0.000045 closs: 0.6884 (0.7617) grad_norm: 0.4406 (0.4607) time: 5.5720 data: 0.0002 max mem: 71357 +[09:51:25.522880] Epoch: [1] [4130/6500] lr: 0.000045 closs: 0.7258 (0.7617) grad_norm: 0.4095 (0.4607) time: 5.5764 data: 0.0001 max mem: 71357 +[09:52:21.422843] Epoch: [1] [4140/6500] lr: 0.000045 closs: 0.7362 (0.7617) grad_norm: 0.3938 (0.4605) time: 5.5841 data: 0.0001 max mem: 71357 +[09:53:17.172143] Epoch: [1] [4150/6500] lr: 0.000045 closs: 0.7308 (0.7617) grad_norm: 0.3742 (0.4604) time: 5.5824 data: 0.0001 max mem: 71357 +[09:54:12.914122] Epoch: [1] [4160/6500] lr: 0.000045 closs: 0.7515 (0.7616) grad_norm: 0.3742 (0.4603) time: 5.5744 data: 0.0001 max mem: 71357 +[09:55:08.625285] Epoch: [1] [4170/6500] lr: 0.000045 closs: 0.7577 (0.7616) grad_norm: 0.3742 (0.4602) time: 5.5725 data: 0.0001 max mem: 71357 +[09:56:04.404235] Epoch: [1] [4180/6500] lr: 0.000045 closs: 0.7953 (0.7617) grad_norm: 0.3742 (0.4600) time: 5.5744 data: 0.0001 max mem: 71357 +[09:57:00.149637] Epoch: [1] [4190/6500] lr: 0.000045 closs: 0.7425 (0.7615) grad_norm: 0.3742 (0.4598) time: 5.5761 data: 0.0001 max mem: 71357 +[09:57:55.891705] Epoch: [1] [4200/6500] lr: 0.000045 closs: 0.7307 (0.7614) grad_norm: 0.3675 (0.4596) time: 5.5743 data: 0.0001 max mem: 71357 +[09:58:51.703348] Epoch: [1] [4210/6500] lr: 0.000045 closs: 0.7554 (0.7616) grad_norm: 0.3542 (0.4596) time: 5.5776 data: 0.0001 max mem: 71357 +[09:59:47.523751] Epoch: [1] [4220/6500] lr: 0.000045 closs: 0.7977 (0.7617) grad_norm: 0.3714 (0.4594) time: 5.5815 data: 0.0001 max mem: 71357 +[10:00:43.137897] Epoch: [1] [4230/6500] lr: 0.000045 closs: 0.7750 (0.7617) grad_norm: 0.3772 (0.4597) time: 5.5716 data: 0.0001 max mem: 71357 +[10:01:38.861888] Epoch: [1] [4240/6500] lr: 0.000045 closs: 0.7175 (0.7616) grad_norm: 0.3714 (0.4594) time: 5.5668 data: 0.0001 max mem: 71357 +[10:02:34.575388] Epoch: [1] [4250/6500] lr: 0.000045 closs: 0.7552 (0.7615) grad_norm: 0.3486 (0.4591) time: 5.5718 data: 0.0001 max mem: 71357 +[10:03:30.284625] Epoch: [1] [4260/6500] lr: 0.000045 closs: 0.8085 
(0.7617) grad_norm: 0.3462 (0.4591) time: 5.5710 data: 0.0001 max mem: 71357 +[10:04:26.115209] Epoch: [1] [4270/6500] lr: 0.000045 closs: 0.7622 (0.7617) grad_norm: 0.3462 (0.4591) time: 5.5769 data: 0.0001 max mem: 71357 +[10:05:21.870075] Epoch: [1] [4280/6500] lr: 0.000045 closs: 0.7565 (0.7615) grad_norm: 0.3631 (0.4589) time: 5.5792 data: 0.0001 max mem: 71357 +[10:06:17.586494] Epoch: [1] [4290/6500] lr: 0.000045 closs: 0.7211 (0.7615) grad_norm: 0.3673 (0.4587) time: 5.5734 data: 0.0001 max mem: 71357 +[10:07:13.372539] Epoch: [1] [4300/6500] lr: 0.000045 closs: 0.7198 (0.7613) grad_norm: 0.3553 (0.4586) time: 5.5750 data: 0.0001 max mem: 71357 +[10:08:09.184606] Epoch: [1] [4310/6500] lr: 0.000045 closs: 0.7264 (0.7612) grad_norm: 0.3453 (0.4589) time: 5.5798 data: 0.0001 max mem: 71357 +[10:09:04.953272] Epoch: [1] [4320/6500] lr: 0.000045 closs: 0.7640 (0.7613) grad_norm: 0.3516 (0.4588) time: 5.5789 data: 0.0001 max mem: 71357 +[10:10:00.689997] Epoch: [1] [4330/6500] lr: 0.000045 closs: 0.7334 (0.7610) grad_norm: 0.3783 (0.4586) time: 5.5752 data: 0.0001 max mem: 71357 +[10:10:56.358374] Epoch: [1] [4340/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4012 (0.4585) time: 5.5702 data: 0.0001 max mem: 71357 +[10:11:52.042943] Epoch: [1] [4350/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4101 (0.4585) time: 5.5675 data: 0.0001 max mem: 71357 +[10:12:47.814019] Epoch: [1] [4360/6500] lr: 0.000045 closs: 0.7259 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5727 data: 0.0001 max mem: 71357 +[10:13:43.550859] Epoch: [1] [4370/6500] lr: 0.000045 closs: 0.7493 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5753 data: 0.0001 max mem: 71357 +[10:14:39.253856] Epoch: [1] [4380/6500] lr: 0.000045 closs: 0.7821 (0.7611) grad_norm: 0.4144 (0.4587) time: 5.5719 data: 0.0001 max mem: 71357 +[10:15:35.182285] Epoch: [1] [4390/6500] lr: 0.000045 closs: 0.7537 (0.7612) grad_norm: 0.3995 (0.4589) time: 5.5815 data: 0.0001 max mem: 71357 +[10:16:30.985009] Epoch: [1] [4400/6500] lr: 0.000045 closs: 0.7537 (0.7611) grad_norm: 0.3660 (0.4588) time: 5.5864 data: 0.0001 max mem: 71357 +[10:17:26.686911] Epoch: [1] [4410/6500] lr: 0.000045 closs: 0.7020 (0.7611) grad_norm: 0.3723 (0.4587) time: 5.5751 data: 0.0001 max mem: 71357 +[10:18:22.372236] Epoch: [1] [4420/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3589 (0.4585) time: 5.5692 data: 0.0001 max mem: 71357 +[10:19:18.157527] Epoch: [1] [4430/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3699 (0.4583) time: 5.5734 data: 0.0001 max mem: 71357 +[10:20:14.052884] Epoch: [1] [4440/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3723 (0.4585) time: 5.5839 data: 0.0002 max mem: 71357 +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
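(The torch.distributed.run warning above marks a second job launch appended to this log; its full argument dump follows below.) A note for reading the metric records: each `name: x (y)` pair prints a smoothed recent value followed by the running average in parentheses, so `closs: 0.6983 (0.7610)` means the loss around the current iteration is about 0.6983 while the epoch-wide average sits at 0.7610. Below is a minimal, illustrative sketch for turning this dump into plottable series. It is not part of the LLaMA2-Accessory repository; the names `LINE_RE` and `iter_metrics` are invented here, and the regex is merely inferred from the line format visible above.

import re
from typing import Iterator

# Record format inferred from the log above, e.g.:
# [10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) ...
LINE_RE = re.compile(
    r"\[(?P<ts>[\d:.]+)\] Epoch: \[(?P<epoch>\d+)\] "
    r"\[(?P<step>\d+)/(?P<steps_total>\d+)\] "
    r"lr: (?P<lr>[\d.e+-]+) "
    r"closs: (?P<closs>[\d.]+) \((?P<closs_avg>[\d.]+)\) "
    r"grad_norm: (?P<gnorm>[\d.]+) \((?P<gnorm_avg>[\d.]+)\)"
)

def iter_metrics(text: str) -> Iterator[dict]:
    """Yield one dict per metric record found in a raw log string."""
    # finditer over the whole text (rather than matching line by line)
    # also copes with many records sharing one physical line, as in this diff.
    for m in LINE_RE.finditer(text):
        yield {
            "epoch": int(m["epoch"]),
            "step": int(m["step"]),
            "lr": float(m["lr"]),
            "closs": float(m["closs"]),
            "closs_avg": float(m["closs_avg"]),
            "grad_norm": float(m["gnorm"]),
            "grad_norm_avg": float(m["gnorm_avg"]),
        }

# Sample record copied verbatim from the log below.
sample = ("[10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 "
          "closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) "
          "time: 5.5878 data: 0.0002 max mem: 71357")
print(next(iter_metrics(sample)))
# -> {'epoch': 1, 'step': 4450, 'lr': 4.4e-05, 'closs': 0.6983,
#     'closs_avg': 0.761, 'grad_norm': 0.3699, 'grad_norm_avg': 0.4583}

One detail worth noting from the Namespace dump that follows: with batch_size=4, accum_iter=2, and two data-parallel ranks (world_size=2, model_parallel_size=1), the effective optimizer batch works out to 4 x 2 x 2 = 16 sequences per step, assuming batch_size is counted per data-parallel rank as is conventional; the ModelArgs line (dim=8192, n_layers=80, n_heads=64, n_kv_heads=8) matches the Llama-2-70B geometry.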
+*****************************************
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 0): env://, gpu 0
+[10:21:09.403398] > initializing model parallel with size 1
+[10:21:09.403479] > initializing ddp with size 2
+[10:21:09.403488] > initializing pipeline with size 1
+[10:21:09.450720] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[10:21:09.450818] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=8,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[10:21:09.451703] Start initialization.
+[10:21:09.466825] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+[10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) time: 5.5878 data: 0.0002 max mem: 71357 +[10:22:05.668596] Epoch: [1] [4460/6500] lr: 0.000044 closs: 0.7467 (0.7610) grad_norm: 0.3758 (0.4583) time: 5.5807 data: 0.0002 max mem: 71357 +[10:23:01.470643] Epoch: [1] [4470/6500] lr: 0.000044 closs: 0.7482 (0.7610) grad_norm: 0.3644 (0.4581) time: 5.5776 data: 0.0002 max mem: 71357 +[10:23:57.292617] Epoch: [1] [4480/6500] lr: 0.000044 closs: 0.7199 (0.7609) grad_norm: 0.3613 (0.4579) time: 5.5811 data: 0.0002 max mem: 71357 +[10:24:53.185106] Epoch: [1] [4490/6500] lr: 0.000044 closs: 0.8161 (0.7611) grad_norm: 0.3613 (0.4580) time: 5.5856 data: 0.0001 max mem: 71357 +[10:25:48.906594] Epoch: [1] [4500/6500] lr: 0.000044 closs: 0.8007 (0.7612) grad_norm: 0.3793 (0.4581) time: 5.5806 data: 0.0002 max mem: 71357 +[10:26:44.655847] Epoch: [1] [4510/6500] lr: 0.000044 closs: 0.7206 (0.7613) grad_norm: 0.3905 (0.4579) time: 5.5735 data: 0.0002 max mem: 71357 +[10:27:40.351074] Epoch: [1] [4520/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4285 (0.4580) time: 5.5721 data: 0.0002 max mem: 71357 +[10:28:36.191258] Epoch: [1] [4530/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4008 (0.4579) time: 5.5767 data: 0.0001 max mem: 71357 +[10:29:31.968038] Epoch: [1] [4540/6500] lr: 0.000044 closs: 0.7567 (0.7614) grad_norm: 0.3905 (0.4577) time: 5.5807 data: 0.0001 max mem: 71357 +[10:30:27.763567] Epoch: [1] [4550/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3837 (0.4576) time: 5.5785 data: 0.0001 max mem: 71357 +[10:31:23.432711] Epoch: [1] [4560/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3770 (0.4577) time: 5.5731 data: 0.0001 max mem: 71357 +[10:32:19.140978] Epoch: [1] [4570/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 0.3770 (0.4577) time: 5.5688 data: 0.0001 max mem: 71357 +[10:33:14.948775] Epoch: [1] [4580/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 
0.3815 (0.4577) time: 5.5757 data: 0.0001 max mem: 71357 +[10:34:10.772732] Epoch: [1] [4590/6500] lr: 0.000044 closs: 0.7741 (0.7616) grad_norm: 0.3815 (0.4577) time: 5.5815 data: 0.0001 max mem: 71357 +[10:35:06.473382] Epoch: [1] [4600/6500] lr: 0.000044 closs: 0.7606 (0.7615) grad_norm: 0.3815 (0.4576) time: 5.5761 data: 0.0002 max mem: 71357 +[10:36:02.235458] Epoch: [1] [4610/6500] lr: 0.000044 closs: 0.6930 (0.7615) grad_norm: 0.3815 (0.4577) time: 5.5730 data: 0.0002 max mem: 71357 +[10:36:58.036756] Epoch: [1] [4620/6500] lr: 0.000044 closs: 0.7670 (0.7616) grad_norm: 0.4095 (0.4576) time: 5.5780 data: 0.0001 max mem: 71357 +[10:37:53.792580] Epoch: [1] [4630/6500] lr: 0.000044 closs: 0.7398 (0.7616) grad_norm: 0.4037 (0.4574) time: 5.5778 data: 0.0001 max mem: 71357 +[10:38:49.606204] Epoch: [1] [4640/6500] lr: 0.000044 closs: 0.7056 (0.7615) grad_norm: 0.3881 (0.4572) time: 5.5784 data: 0.0001 max mem: 71357 +[10:39:45.383787] Epoch: [1] [4650/6500] lr: 0.000044 closs: 0.7542 (0.7616) grad_norm: 0.3881 (0.4574) time: 5.5794 data: 0.0002 max mem: 71357 +[10:40:41.177173] Epoch: [1] [4660/6500] lr: 0.000044 closs: 0.7175 (0.7616) grad_norm: 0.3625 (0.4574) time: 5.5785 data: 0.0001 max mem: 71357 +[10:41:36.903399] Epoch: [1] [4670/6500] lr: 0.000044 closs: 0.6899 (0.7614) grad_norm: 0.3625 (0.4574) time: 5.5759 data: 0.0001 max mem: 71357 +[10:42:32.710916] Epoch: [1] [4680/6500] lr: 0.000044 closs: 0.7280 (0.7615) grad_norm: 0.3585 (0.4571) time: 5.5765 data: 0.0001 max mem: 71357 +[10:43:28.526448] Epoch: [1] [4690/6500] lr: 0.000044 closs: 0.7319 (0.7615) grad_norm: 0.3625 (0.4570) time: 5.5810 data: 0.0001 max mem: 71357 +[10:44:24.274780] Epoch: [1] [4700/6500] lr: 0.000044 closs: 0.7385 (0.7614) grad_norm: 0.3723 (0.4571) time: 5.5781 data: 0.0001 max mem: 71357 +[10:45:20.118274] Epoch: [1] [4710/6500] lr: 0.000044 closs: 0.7702 (0.7616) grad_norm: 0.3723 (0.4570) time: 5.5795 data: 0.0002 max mem: 71357 +[10:46:15.876910] Epoch: [1] [4720/6500] lr: 0.000044 closs: 0.8632 (0.7617) grad_norm: 0.3725 (0.4569) time: 5.5800 data: 0.0002 max mem: 71357 +[10:47:11.670606] Epoch: [1] [4730/6500] lr: 0.000044 closs: 0.7829 (0.7617) grad_norm: 0.3538 (0.4566) time: 5.5775 data: 0.0001 max mem: 71357 +[10:48:07.502727] Epoch: [1] [4740/6500] lr: 0.000044 closs: 0.8662 (0.7620) grad_norm: 0.3451 (0.4564) time: 5.5812 data: 0.0001 max mem: 71357 +[10:49:03.350525] Epoch: [1] [4750/6500] lr: 0.000044 closs: 0.8316 (0.7620) grad_norm: 0.3455 (0.4564) time: 5.5839 data: 0.0001 max mem: 71357 +[10:49:59.088820] Epoch: [1] [4760/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.3764 (0.4564) time: 5.5792 data: 0.0002 max mem: 71357 +[10:50:54.918656] Epoch: [1] [4770/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.4139 (0.4562) time: 5.5783 data: 0.0002 max mem: 71357 +[10:51:50.661923] Epoch: [1] [4780/6500] lr: 0.000044 closs: 0.7886 (0.7621) grad_norm: 0.4284 (0.4565) time: 5.5786 data: 0.0001 max mem: 71357 +[10:52:46.468233] Epoch: [1] [4790/6500] lr: 0.000044 closs: 0.8177 (0.7621) grad_norm: 0.4017 (0.4562) time: 5.5774 data: 0.0001 max mem: 71357 +[10:53:42.298368] Epoch: [1] [4800/6500] lr: 0.000044 closs: 0.8177 (0.7623) grad_norm: 0.3854 (0.4563) time: 5.5817 data: 0.0001 max mem: 71357 +[10:54:37.963093] Epoch: [1] [4810/6500] lr: 0.000044 closs: 0.7960 (0.7625) grad_norm: 0.3854 (0.4572) time: 5.5746 data: 0.0001 max mem: 71357 +[10:55:33.724578] Epoch: [1] [4820/6500] lr: 0.000044 closs: 0.7233 (0.7623) grad_norm: 0.3701 (0.4571) time: 5.5712 data: 0.0001 max mem: 
71357 +[10:56:29.502489] Epoch: [1] [4830/6500] lr: 0.000044 closs: 0.7313 (0.7624) grad_norm: 0.3720 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 +[10:57:25.285468] Epoch: [1] [4840/6500] lr: 0.000043 closs: 0.7270 (0.7623) grad_norm: 0.3717 (0.4578) time: 5.5780 data: 0.0001 max mem: 71357 +[10:58:20.999566] Epoch: [1] [4850/6500] lr: 0.000043 closs: 0.7105 (0.7623) grad_norm: 0.4146 (0.4578) time: 5.5747 data: 0.0001 max mem: 71357 +[10:59:16.743808] Epoch: [1] [4860/6500] lr: 0.000043 closs: 0.7434 (0.7624) grad_norm: 0.4005 (0.4577) time: 5.5728 data: 0.0001 max mem: 71357 +[11:00:12.471000] Epoch: [1] [4870/6500] lr: 0.000043 closs: 0.8188 (0.7625) grad_norm: 0.4129 (0.4576) time: 5.5734 data: 0.0002 max mem: 71357 +[11:01:08.334659] Epoch: [1] [4880/6500] lr: 0.000043 closs: 0.7880 (0.7625) grad_norm: 0.3817 (0.4573) time: 5.5794 data: 0.0002 max mem: 71357 +[11:02:04.159214] Epoch: [1] [4890/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3529 (0.4571) time: 5.5843 data: 0.0001 max mem: 71357 +[11:02:59.872216] Epoch: [1] [4900/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3376 (0.4570) time: 5.5768 data: 0.0001 max mem: 71357 +[11:03:55.655000] Epoch: [1] [4910/6500] lr: 0.000043 closs: 0.7754 (0.7626) grad_norm: 0.3316 (0.4568) time: 5.5747 data: 0.0001 max mem: 71357 +[11:04:51.487749] Epoch: [1] [4920/6500] lr: 0.000043 closs: 0.7336 (0.7625) grad_norm: 0.3376 (0.4566) time: 5.5806 data: 0.0002 max mem: 71357 +[11:05:47.384044] Epoch: [1] [4930/6500] lr: 0.000043 closs: 0.7084 (0.7624) grad_norm: 0.3480 (0.4565) time: 5.5863 data: 0.0002 max mem: 71357 +[11:06:43.177011] Epoch: [1] [4940/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3427 (0.4563) time: 5.5843 data: 0.0001 max mem: 71357 +[11:07:38.843228] Epoch: [1] [4950/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3660 (0.4564) time: 5.5728 data: 0.0001 max mem: 71357 +[11:08:34.583423] Epoch: [1] [4960/6500] lr: 0.000043 closs: 0.7790 (0.7624) grad_norm: 0.4092 (0.4563) time: 5.5702 data: 0.0001 max mem: 71357 +[11:09:30.439676] Epoch: [1] [4970/6500] lr: 0.000043 closs: 0.7991 (0.7624) grad_norm: 0.4092 (0.4561) time: 5.5797 data: 0.0001 max mem: 71357 +[11:10:26.188771] Epoch: [1] [4980/6500] lr: 0.000043 closs: 0.7671 (0.7624) grad_norm: 0.3963 (0.4561) time: 5.5801 data: 0.0002 max mem: 71357 +[11:11:21.946425] Epoch: [1] [4990/6500] lr: 0.000043 closs: 0.7342 (0.7623) grad_norm: 0.3800 (0.4560) time: 5.5752 data: 0.0002 max mem: 71357 +[11:12:17.744689] Epoch: [1] [5000/6500] lr: 0.000043 closs: 0.7342 (0.7622) grad_norm: 0.3800 (0.4565) time: 5.5777 data: 0.0001 max mem: 71357 +[11:13:13.585475] Epoch: [1] [5010/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3830 (0.4563) time: 5.5818 data: 0.0001 max mem: 71357 +[11:14:09.395279] Epoch: [1] [5020/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3689 (0.4561) time: 5.5824 data: 0.0001 max mem: 71357 +[11:15:05.164880] Epoch: [1] [5030/6500] lr: 0.000043 closs: 0.7894 (0.7623) grad_norm: 0.3559 (0.4563) time: 5.5788 data: 0.0002 max mem: 71357 +[11:16:00.882700] Epoch: [1] [5040/6500] lr: 0.000043 closs: 0.7406 (0.7621) grad_norm: 0.3560 (0.4567) time: 5.5742 data: 0.0002 max mem: 71357 +[11:16:56.643127] Epoch: [1] [5050/6500] lr: 0.000043 closs: 0.7206 (0.7621) grad_norm: 0.3608 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 +[11:17:52.504961] Epoch: [1] [5060/6500] lr: 0.000043 closs: 0.7532 (0.7621) grad_norm: 0.3612 (0.4564) time: 5.5810 data: 0.0001 max mem: 71357 +[11:18:48.234806] Epoch: [1] [5070/6500] 
lr: 0.000043 closs: 0.7623 (0.7620) grad_norm: 0.3612 (0.4564) time: 5.5795 data: 0.0001 max mem: 71357 +[11:19:44.087526] Epoch: [1] [5080/6500] lr: 0.000043 closs: 0.6898 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5790 data: 0.0001 max mem: 71357 +[11:20:39.901787] Epoch: [1] [5090/6500] lr: 0.000043 closs: 0.6929 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5832 data: 0.0001 max mem: 71357 +[11:21:35.717532] Epoch: [1] [5100/6500] lr: 0.000043 closs: 0.7438 (0.7620) grad_norm: 0.3975 (0.4565) time: 5.5814 data: 0.0001 max mem: 71357 +[11:22:31.404559] Epoch: [1] [5110/6500] lr: 0.000043 closs: 0.7661 (0.7621) grad_norm: 0.4491 (0.4566) time: 5.5750 data: 0.0001 max mem: 71357 +[11:23:27.158982] Epoch: [1] [5120/6500] lr: 0.000043 closs: 0.8052 (0.7622) grad_norm: 0.4491 (0.4565) time: 5.5720 data: 0.0001 max mem: 71357 +[11:24:22.940301] Epoch: [1] [5130/6500] lr: 0.000043 closs: 0.7489 (0.7619) grad_norm: 0.4491 (0.4564) time: 5.5767 data: 0.0001 max mem: 71357 +[11:25:18.734962] Epoch: [1] [5140/6500] lr: 0.000043 closs: 0.7677 (0.7621) grad_norm: 0.4062 (0.4563) time: 5.5787 data: 0.0002 max mem: 71357 +[11:26:14.603090] Epoch: [1] [5150/6500] lr: 0.000043 closs: 0.7984 (0.7621) grad_norm: 0.3606 (0.4563) time: 5.5830 data: 0.0002 max mem: 71357 +[11:27:10.315042] Epoch: [1] [5160/6500] lr: 0.000043 closs: 0.7870 (0.7623) grad_norm: 0.3626 (0.4561) time: 5.5789 data: 0.0001 max mem: 71357 +[11:28:05.963749] Epoch: [1] [5170/6500] lr: 0.000043 closs: 0.8181 (0.7624) grad_norm: 0.3642 (0.4562) time: 5.5679 data: 0.0001 max mem: 71357 +[11:29:01.772870] Epoch: [1] [5180/6500] lr: 0.000043 closs: 0.7284 (0.7623) grad_norm: 0.3776 (0.4562) time: 5.5728 data: 0.0001 max mem: 71357 +[11:29:57.532695] Epoch: [1] [5190/6500] lr: 0.000043 closs: 0.7601 (0.7623) grad_norm: 0.3662 (0.4561) time: 5.5783 data: 0.0002 max mem: 71357 +[11:30:53.205321] Epoch: [1] [5200/6500] lr: 0.000043 closs: 0.7734 (0.7623) grad_norm: 0.4361 (0.4561) time: 5.5715 data: 0.0002 max mem: 71357 +[11:31:48.890650] Epoch: [1] [5210/6500] lr: 0.000043 closs: 0.7016 (0.7622) grad_norm: 0.4014 (0.4561) time: 5.5678 data: 0.0001 max mem: 71357 +[11:32:44.490303] Epoch: [1] [5220/6500] lr: 0.000043 closs: 0.7215 (0.7623) grad_norm: 0.4014 (0.4563) time: 5.5642 data: 0.0001 max mem: 71357 +[11:33:40.288737] Epoch: [1] [5230/6500] lr: 0.000042 closs: 0.6987 (0.7620) grad_norm: 0.4014 (0.4562) time: 5.5698 data: 0.0001 max mem: 71357 +[11:34:36.201586] Epoch: [1] [5240/6500] lr: 0.000042 closs: 0.7130 (0.7620) grad_norm: 0.3873 (0.4561) time: 5.5855 data: 0.0001 max mem: 71357 +[11:35:31.926282] Epoch: [1] [5250/6500] lr: 0.000042 closs: 0.7390 (0.7619) grad_norm: 0.3751 (0.4559) time: 5.5818 data: 0.0002 max mem: 71357 +[11:36:27.605601] Epoch: [1] [5260/6500] lr: 0.000042 closs: 0.6863 (0.7618) grad_norm: 0.3751 (0.4560) time: 5.5701 data: 0.0002 max mem: 71357 +[11:37:23.356717] Epoch: [1] [5270/6500] lr: 0.000042 closs: 0.7545 (0.7619) grad_norm: 0.3868 (0.4560) time: 5.5714 data: 0.0001 max mem: 71357 +[11:38:19.052455] Epoch: [1] [5280/6500] lr: 0.000042 closs: 0.7771 (0.7618) grad_norm: 0.4176 (0.4562) time: 5.5723 data: 0.0001 max mem: 71357 +[11:39:14.810553] Epoch: [1] [5290/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.4176 (0.4560) time: 5.5726 data: 0.0001 max mem: 71357 +[11:40:10.478168] Epoch: [1] [5300/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.3894 (0.4561) time: 5.5712 data: 0.0002 max mem: 71357 +[11:41:06.204522] Epoch: [1] [5310/6500] lr: 0.000042 closs: 0.8350 (0.7620) grad_norm: 
0.4485 (0.4564) time: 5.5696 data: 0.0002 max mem: 71357 +[11:42:02.028377] Epoch: [1] [5320/6500] lr: 0.000042 closs: 0.8021 (0.7620) grad_norm: 0.3894 (0.4565) time: 5.5774 data: 0.0001 max mem: 71357 +[11:42:57.762208] Epoch: [1] [5330/6500] lr: 0.000042 closs: 0.7210 (0.7620) grad_norm: 0.4442 (0.4564) time: 5.5778 data: 0.0001 max mem: 71357 +[11:43:53.467868] Epoch: [1] [5340/6500] lr: 0.000042 closs: 0.7326 (0.7621) grad_norm: 0.4205 (0.4564) time: 5.5719 data: 0.0001 max mem: 71357 +[11:44:49.249976] Epoch: [1] [5350/6500] lr: 0.000042 closs: 0.7308 (0.7620) grad_norm: 0.3894 (0.4562) time: 5.5743 data: 0.0001 max mem: 71357 +[11:45:44.957778] Epoch: [1] [5360/6500] lr: 0.000042 closs: 0.6752 (0.7620) grad_norm: 0.3891 (0.4562) time: 5.5744 data: 0.0001 max mem: 71357 +[11:46:40.748059] Epoch: [1] [5370/6500] lr: 0.000042 closs: 0.6930 (0.7618) grad_norm: 0.3779 (0.4562) time: 5.5748 data: 0.0001 max mem: 71357 +[11:47:36.454319] Epoch: [1] [5380/6500] lr: 0.000042 closs: 0.7712 (0.7620) grad_norm: 0.3879 (0.4562) time: 5.5747 data: 0.0001 max mem: 71357 +[11:48:32.242290] Epoch: [1] [5390/6500] lr: 0.000042 closs: 0.8651 (0.7621) grad_norm: 0.3879 (0.4562) time: 5.5746 data: 0.0001 max mem: 71357 +[11:49:27.957643] Epoch: [1] [5400/6500] lr: 0.000042 closs: 0.7200 (0.7620) grad_norm: 0.4342 (0.4562) time: 5.5751 data: 0.0001 max mem: 71357 +[11:50:23.766466] Epoch: [1] [5410/6500] lr: 0.000042 closs: 0.7646 (0.7621) grad_norm: 0.4439 (0.4564) time: 5.5761 data: 0.0001 max mem: 71357 +[11:51:19.479821] Epoch: [1] [5420/6500] lr: 0.000042 closs: 0.7639 (0.7620) grad_norm: 0.4558 (0.4567) time: 5.5760 data: 0.0001 max mem: 71357 +[11:52:15.244104] Epoch: [1] [5430/6500] lr: 0.000042 closs: 0.7082 (0.7620) grad_norm: 0.4558 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 +[11:53:10.897641] Epoch: [1] [5440/6500] lr: 0.000042 closs: 0.7920 (0.7621) grad_norm: 0.4017 (0.4564) time: 5.5708 data: 0.0001 max mem: 71357 +[11:54:06.694640] Epoch: [1] [5450/6500] lr: 0.000042 closs: 0.7965 (0.7622) grad_norm: 0.4201 (0.4564) time: 5.5724 data: 0.0001 max mem: 71357 +[11:55:02.484616] Epoch: [1] [5460/6500] lr: 0.000042 closs: 0.7598 (0.7622) grad_norm: 0.3805 (0.4563) time: 5.5793 data: 0.0001 max mem: 71357 +[11:55:58.257271] Epoch: [1] [5470/6500] lr: 0.000042 closs: 0.7568 (0.7622) grad_norm: 0.3925 (0.4562) time: 5.5781 data: 0.0001 max mem: 71357 +[11:56:53.980040] Epoch: [1] [5480/6500] lr: 0.000042 closs: 0.7271 (0.7621) grad_norm: 0.3925 (0.4563) time: 5.5747 data: 0.0001 max mem: 71357 +[11:57:49.775101] Epoch: [1] [5490/6500] lr: 0.000042 closs: 0.7271 (0.7620) grad_norm: 0.3860 (0.4561) time: 5.5758 data: 0.0001 max mem: 71357 +[11:58:45.515824] Epoch: [1] [5500/6500] lr: 0.000042 closs: 0.7487 (0.7620) grad_norm: 0.3860 (0.4560) time: 5.5767 data: 0.0001 max mem: 71357 +[11:59:41.216243] Epoch: [1] [5510/6500] lr: 0.000042 closs: 0.7174 (0.7619) grad_norm: 0.4166 (0.4561) time: 5.5720 data: 0.0001 max mem: 71357 +[12:00:36.966815] Epoch: [1] [5520/6500] lr: 0.000042 closs: 0.7540 (0.7621) grad_norm: 0.3887 (0.4560) time: 5.5725 data: 0.0001 max mem: 71357 +[12:01:32.831173] Epoch: [1] [5530/6500] lr: 0.000042 closs: 0.7546 (0.7621) grad_norm: 0.4038 (0.4560) time: 5.5807 data: 0.0001 max mem: 71357 +[12:02:28.647159] Epoch: [1] [5540/6500] lr: 0.000042 closs: 0.7516 (0.7621) grad_norm: 0.4075 (0.4560) time: 5.5839 data: 0.0001 max mem: 71357 +[12:03:24.364310] Epoch: [1] [5550/6500] lr: 0.000042 closs: 0.7670 (0.7621) grad_norm: 0.3887 (0.4558) time: 5.5766 data: 0.0001 max mem: 
71357 +[12:04:20.155378] Epoch: [1] [5560/6500] lr: 0.000042 closs: 0.7699 (0.7622) grad_norm: 0.3411 (0.4556) time: 5.5753 data: 0.0001 max mem: 71357 +[12:05:15.851908] Epoch: [1] [5570/6500] lr: 0.000042 closs: 0.7497 (0.7622) grad_norm: 0.3411 (0.4555) time: 5.5743 data: 0.0001 max mem: 71357 +[12:06:11.602612] Epoch: [1] [5580/6500] lr: 0.000042 closs: 0.7377 (0.7622) grad_norm: 0.3426 (0.4556) time: 5.5723 data: 0.0001 max mem: 71357 +[12:07:07.492817] Epoch: [1] [5590/6500] lr: 0.000041 closs: 0.7341 (0.7622) grad_norm: 0.3762 (0.4555) time: 5.5820 data: 0.0001 max mem: 71357 +[12:08:03.293439] Epoch: [1] [5600/6500] lr: 0.000041 closs: 0.7341 (0.7623) grad_norm: 0.4020 (0.4553) time: 5.5845 data: 0.0001 max mem: 71357 +[12:08:58.953646] Epoch: [1] [5610/6500] lr: 0.000041 closs: 0.7692 (0.7621) grad_norm: 0.4286 (0.4556) time: 5.5730 data: 0.0001 max mem: 71357 +[12:09:54.762282] Epoch: [1] [5620/6500] lr: 0.000041 closs: 0.7133 (0.7621) grad_norm: 0.3748 (0.4555) time: 5.5733 data: 0.0001 max mem: 71357 +[12:10:50.637061] Epoch: [1] [5630/6500] lr: 0.000041 closs: 0.7529 (0.7622) grad_norm: 0.3843 (0.4555) time: 5.5841 data: 0.0001 max mem: 71357 +[12:11:46.338076] Epoch: [1] [5640/6500] lr: 0.000041 closs: 0.8054 (0.7622) grad_norm: 0.3843 (0.4553) time: 5.5787 data: 0.0001 max mem: 71357 +[12:12:42.052979] Epoch: [1] [5650/6500] lr: 0.000041 closs: 0.8054 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5707 data: 0.0001 max mem: 71357 +[12:13:37.757138] Epoch: [1] [5660/6500] lr: 0.000041 closs: 0.7757 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5709 data: 0.0001 max mem: 71357 +[12:14:33.519114] Epoch: [1] [5670/6500] lr: 0.000041 closs: 0.7771 (0.7623) grad_norm: 0.3751 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357 +[12:15:29.262342] Epoch: [1] [5680/6500] lr: 0.000041 closs: 0.8003 (0.7625) grad_norm: 0.3742 (0.4550) time: 5.5752 data: 0.0001 max mem: 71357 +[12:16:24.960305] Epoch: [1] [5690/6500] lr: 0.000041 closs: 0.7774 (0.7625) grad_norm: 0.3751 (0.4549) time: 5.5720 data: 0.0001 max mem: 71357 +[12:17:20.818451] Epoch: [1] [5700/6500] lr: 0.000041 closs: 0.7509 (0.7625) grad_norm: 0.3829 (0.4548) time: 5.5777 data: 0.0001 max mem: 71357 +[12:18:16.589827] Epoch: [1] [5710/6500] lr: 0.000041 closs: 0.7242 (0.7625) grad_norm: 0.3742 (0.4547) time: 5.5814 data: 0.0001 max mem: 71357 +[12:19:12.364366] Epoch: [1] [5720/6500] lr: 0.000041 closs: 0.7242 (0.7624) grad_norm: 0.4329 (0.4547) time: 5.5772 data: 0.0002 max mem: 71357 +[12:20:08.204706] Epoch: [1] [5730/6500] lr: 0.000041 closs: 0.7491 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5806 data: 0.0002 max mem: 71357 +[12:21:03.861751] Epoch: [1] [5740/6500] lr: 0.000041 closs: 0.7838 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5747 data: 0.0001 max mem: 71357 +[12:21:59.718259] Epoch: [1] [5750/6500] lr: 0.000041 closs: 0.7416 (0.7624) grad_norm: 0.3975 (0.4546) time: 5.5756 data: 0.0001 max mem: 71357 +[12:22:55.524356] Epoch: [1] [5760/6500] lr: 0.000041 closs: 0.7056 (0.7623) grad_norm: 0.3606 (0.4545) time: 5.5830 data: 0.0001 max mem: 71357 +[12:23:51.380407] Epoch: [1] [5770/6500] lr: 0.000041 closs: 0.7723 (0.7624) grad_norm: 0.3858 (0.4544) time: 5.5830 data: 0.0001 max mem: 71357 +[12:24:47.150261] Epoch: [1] [5780/6500] lr: 0.000041 closs: 0.7954 (0.7623) grad_norm: 0.4207 (0.4544) time: 5.5812 data: 0.0001 max mem: 71357 +[12:25:42.827316] Epoch: [1] [5790/6500] lr: 0.000041 closs: 0.6956 (0.7622) grad_norm: 0.3718 (0.4544) time: 5.5722 data: 0.0002 max mem: 71357 +[12:26:38.608622] Epoch: [1] [5800/6500] 
lr: 0.000041 closs: 0.6790 (0.7621) grad_norm: 0.3812 (0.4543) time: 5.5728 data: 0.0001 max mem: 71357 +[12:27:34.355703] Epoch: [1] [5810/6500] lr: 0.000041 closs: 0.6473 (0.7620) grad_norm: 0.4115 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357 +[12:28:30.033082] Epoch: [1] [5820/6500] lr: 0.000041 closs: 0.6839 (0.7620) grad_norm: 0.4103 (0.4543) time: 5.5711 data: 0.0001 max mem: 71357 +[12:29:25.690915] Epoch: [1] [5830/6500] lr: 0.000041 closs: 0.7630 (0.7620) grad_norm: 0.4103 (0.4544) time: 5.5667 data: 0.0001 max mem: 71357 +[12:30:21.426603] Epoch: [1] [5840/6500] lr: 0.000041 closs: 0.7711 (0.7621) grad_norm: 0.4103 (0.4542) time: 5.5696 data: 0.0001 max mem: 71357 +[12:31:17.292434] Epoch: [1] [5850/6500] lr: 0.000041 closs: 0.8304 (0.7622) grad_norm: 0.3960 (0.4541) time: 5.5800 data: 0.0001 max mem: 71357 +[12:32:12.900412] Epoch: [1] [5860/6500] lr: 0.000041 closs: 0.7820 (0.7621) grad_norm: 0.3960 (0.4541) time: 5.5736 data: 0.0001 max mem: 71357 +[12:33:08.766672] Epoch: [1] [5870/6500] lr: 0.000041 closs: 0.6789 (0.7621) grad_norm: 0.3778 (0.4540) time: 5.5736 data: 0.0001 max mem: 71357 +[12:34:04.393916] Epoch: [1] [5880/6500] lr: 0.000041 closs: 0.7246 (0.7622) grad_norm: 0.3856 (0.4547) time: 5.5746 data: 0.0001 max mem: 71357 +[12:35:00.125761] Epoch: [1] [5890/6500] lr: 0.000041 closs: 0.7455 (0.7621) grad_norm: 0.3856 (0.4546) time: 5.5679 data: 0.0001 max mem: 71357 +[12:35:55.981470] Epoch: [1] [5900/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4545) time: 5.5793 data: 0.0001 max mem: 71357 +[12:36:51.710443] Epoch: [1] [5910/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4544) time: 5.5792 data: 0.0001 max mem: 71357 +[12:37:47.571872] Epoch: [1] [5920/6500] lr: 0.000041 closs: 0.6987 (0.7619) grad_norm: 0.3471 (0.4542) time: 5.5794 data: 0.0001 max mem: 71357 +[12:38:43.443859] Epoch: [1] [5930/6500] lr: 0.000040 closs: 0.6987 (0.7619) grad_norm: 0.3331 (0.4540) time: 5.5866 data: 0.0001 max mem: 71357 +[12:39:39.343282] Epoch: [1] [5940/6500] lr: 0.000040 closs: 0.7192 (0.7619) grad_norm: 0.3435 (0.4541) time: 5.5884 data: 0.0001 max mem: 71357 +[12:40:35.146723] Epoch: [1] [5950/6500] lr: 0.000040 closs: 0.7379 (0.7619) grad_norm: 0.3435 (0.4539) time: 5.5850 data: 0.0002 max mem: 71357 +[12:41:30.902370] Epoch: [1] [5960/6500] lr: 0.000040 closs: 0.7951 (0.7619) grad_norm: 0.3445 (0.4538) time: 5.5779 data: 0.0002 max mem: 71357 +[12:42:26.717992] Epoch: [1] [5970/6500] lr: 0.000040 closs: 0.7941 (0.7620) grad_norm: 0.4037 (0.4539) time: 5.5785 data: 0.0002 max mem: 71357 +[12:43:22.562358] Epoch: [1] [5980/6500] lr: 0.000040 closs: 0.7629 (0.7621) grad_norm: 0.3538 (0.4538) time: 5.5829 data: 0.0002 max mem: 71357 +[12:44:18.336240] Epoch: [1] [5990/6500] lr: 0.000040 closs: 0.7501 (0.7620) grad_norm: 0.3538 (0.4536) time: 5.5808 data: 0.0001 max mem: 71357 +[12:45:14.046587] Epoch: [1] [6000/6500] lr: 0.000040 closs: 0.6951 (0.7620) grad_norm: 0.3621 (0.4535) time: 5.5741 data: 0.0002 max mem: 71357 +[12:46:09.836934] Epoch: [1] [6010/6500] lr: 0.000040 closs: 0.6929 (0.7619) grad_norm: 0.3621 (0.4534) time: 5.5749 data: 0.0002 max mem: 71357 +[12:47:05.668098] Epoch: [1] [6020/6500] lr: 0.000040 closs: 0.7398 (0.7619) grad_norm: 0.3923 (0.4538) time: 5.5810 data: 0.0002 max mem: 71357 +[12:48:01.635878] Epoch: [1] [6030/6500] lr: 0.000040 closs: 0.7508 (0.7619) grad_norm: 0.4124 (0.4537) time: 5.5898 data: 0.0002 max mem: 71357 +[12:48:57.359248] Epoch: [1] [6040/6500] lr: 0.000040 closs: 0.7133 (0.7618) grad_norm: 
0.4416 (0.4538) time: 5.5845 data: 0.0002 max mem: 71357 +[12:49:53.165180] Epoch: [1] [6050/6500] lr: 0.000040 closs: 0.7493 (0.7620) grad_norm: 0.4416 (0.4538) time: 5.5764 data: 0.0001 max mem: 71357 +[12:50:49.040989] Epoch: [1] [6060/6500] lr: 0.000040 closs: 0.8315 (0.7620) grad_norm: 0.4096 (0.4538) time: 5.5840 data: 0.0002 max mem: 71357 +[12:51:44.970720] Epoch: [1] [6070/6500] lr: 0.000040 closs: 0.7805 (0.7621) grad_norm: 0.4065 (0.4536) time: 5.5902 data: 0.0002 max mem: 71357 +[12:52:40.669392] Epoch: [1] [6080/6500] lr: 0.000040 closs: 0.8480 (0.7623) grad_norm: 0.4065 (0.4539) time: 5.5813 data: 0.0001 max mem: 71357 +[12:53:36.427897] Epoch: [1] [6090/6500] lr: 0.000040 closs: 0.8226 (0.7624) grad_norm: 0.4065 (0.4540) time: 5.5728 data: 0.0001 max mem: 71357 +[12:54:32.195913] Epoch: [1] [6100/6500] lr: 0.000040 closs: 0.7264 (0.7623) grad_norm: 0.3978 (0.4539) time: 5.5763 data: 0.0001 max mem: 71357 +[12:55:27.987231] Epoch: [1] [6110/6500] lr: 0.000040 closs: 0.8074 (0.7625) grad_norm: 0.4201 (0.4538) time: 5.5779 data: 0.0001 max mem: 71357 +[12:56:23.829834] Epoch: [1] [6120/6500] lr: 0.000040 closs: 0.8245 (0.7626) grad_norm: 0.3929 (0.4537) time: 5.5816 data: 0.0001 max mem: 71357 +[12:57:19.521242] Epoch: [1] [6130/6500] lr: 0.000040 closs: 0.8077 (0.7628) grad_norm: 0.3743 (0.4536) time: 5.5766 data: 0.0001 max mem: 71357 +[12:58:15.203234] Epoch: [1] [6140/6500] lr: 0.000040 closs: 0.7957 (0.7627) grad_norm: 0.3747 (0.4556) time: 5.5686 data: 0.0001 max mem: 71357 +[12:59:10.946149] Epoch: [1] [6150/6500] lr: 0.000040 closs: 0.7957 (0.7628) grad_norm: 0.3743 (0.4555) time: 5.5712 data: 0.0001 max mem: 71357 +[13:00:06.841654] Epoch: [1] [6160/6500] lr: 0.000040 closs: 0.8200 (0.7629) grad_norm: 0.3747 (0.4554) time: 5.5818 data: 0.0001 max mem: 71357 +[13:01:02.559307] Epoch: [1] [6170/6500] lr: 0.000040 closs: 0.7666 (0.7628) grad_norm: 0.3837 (0.4553) time: 5.5806 data: 0.0001 max mem: 71357 +[13:01:58.277625] Epoch: [1] [6180/6500] lr: 0.000040 closs: 0.7439 (0.7629) grad_norm: 0.3926 (0.4553) time: 5.5717 data: 0.0001 max mem: 71357 +[13:02:54.029871] Epoch: [1] [6190/6500] lr: 0.000040 closs: 0.8000 (0.7629) grad_norm: 0.3897 (0.4552) time: 5.5734 data: 0.0001 max mem: 71357 +[13:03:49.873703] Epoch: [1] [6200/6500] lr: 0.000040 closs: 0.7667 (0.7628) grad_norm: 0.3926 (0.4552) time: 5.5797 data: 0.0001 max mem: 71357 +[13:04:45.563891] Epoch: [1] [6210/6500] lr: 0.000040 closs: 0.7751 (0.7629) grad_norm: 0.4210 (0.4552) time: 5.5766 data: 0.0001 max mem: 71357 +[13:05:41.287468] Epoch: [1] [6220/6500] lr: 0.000040 closs: 0.7010 (0.7627) grad_norm: 0.3957 (0.4551) time: 5.5706 data: 0.0002 max mem: 71357 +[13:06:37.089385] Epoch: [1] [6230/6500] lr: 0.000040 closs: 0.7090 (0.7628) grad_norm: 0.4210 (0.4550) time: 5.5762 data: 0.0002 max mem: 71357 +[13:07:32.868772] Epoch: [1] [6240/6500] lr: 0.000040 closs: 0.7651 (0.7628) grad_norm: 0.3920 (0.4550) time: 5.5790 data: 0.0001 max mem: 71357 +[13:08:28.652568] Epoch: [1] [6250/6500] lr: 0.000040 closs: 0.7620 (0.7629) grad_norm: 0.3703 (0.4549) time: 5.5781 data: 0.0001 max mem: 71357 +[13:09:24.362382] Epoch: [1] [6260/6500] lr: 0.000039 closs: 0.7858 (0.7629) grad_norm: 0.3703 (0.4548) time: 5.5746 data: 0.0001 max mem: 71357 +[13:10:20.032595] Epoch: [1] [6270/6500] lr: 0.000039 closs: 0.7305 (0.7628) grad_norm: 0.3763 (0.4549) time: 5.5689 data: 0.0001 max mem: 71357 +[13:11:15.731095] Epoch: [1] [6280/6500] lr: 0.000039 closs: 0.7532 (0.7629) grad_norm: 0.3879 (0.4550) time: 5.5683 data: 0.0001 max mem: 
71357
+[13:12:11.442048] Epoch: [1] [6290/6500] lr: 0.000039 closs: 0.7507 (0.7628) grad_norm: 0.4623 (0.4552) time: 5.5703 data: 0.0001 max mem: 71357
+[13:13:07.132957] Epoch: [1] [6300/6500] lr: 0.000039 closs: 0.7333 (0.7628) grad_norm: 0.3946 (0.4550) time: 5.5700 data: 0.0001 max mem: 71357
+[13:14:02.908037] Epoch: [1] [6310/6500] lr: 0.000039 closs: 0.7169 (0.7626) grad_norm: 0.4401 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357
+[13:14:58.686738] Epoch: [1] [6320/6500] lr: 0.000039 closs: 0.7381 (0.7627) grad_norm: 0.3946 (0.4552) time: 5.5776 data: 0.0001 max mem: 71357
+[13:15:54.502734] Epoch: [1] [6330/6500] lr: 0.000039 closs: 0.7568 (0.7627) grad_norm: 0.3855 (0.4550) time: 5.5796 data: 0.0001 max mem: 71357
+[13:16:50.331408] Epoch: [1] [6340/6500] lr: 0.000039 closs: 0.7370 (0.7626) grad_norm: 0.3901 (0.4549) time: 5.5821 data: 0.0001 max mem: 71357
+[13:17:46.076366] Epoch: [1] [6350/6500] lr: 0.000039 closs: 0.7401 (0.7627) grad_norm: 0.3855 (0.4551) time: 5.5786 data: 0.0001 max mem: 71357
+[13:18:41.870339] Epoch: [1] [6360/6500] lr: 0.000039 closs: 0.8105 (0.7628) grad_norm: 0.3901 (0.4551) time: 5.5768 data: 0.0002 max mem: 71357
+[13:19:37.808359] Epoch: [1] [6370/6500] lr: 0.000039 closs: 0.7998 (0.7628) grad_norm: 0.4095 (0.4550) time: 5.5865 data: 0.0002 max mem: 71357
+[13:20:33.725798] Epoch: [1] [6380/6500] lr: 0.000039 closs: 0.7342 (0.7626) grad_norm: 0.3924 (0.4549) time: 5.5926 data: 0.0002 max mem: 71357
+[13:21:29.484753] Epoch: [1] [6390/6500] lr: 0.000039 closs: 0.6765 (0.7626) grad_norm: 0.3924 (0.4548) time: 5.5836 data: 0.0002 max mem: 71357
+[13:22:25.257086] Epoch: [1] [6400/6500] lr: 0.000039 closs: 0.7452 (0.7627) grad_norm: 0.3553 (0.4548) time: 5.5764 data: 0.0002 max mem: 71357
+[13:23:21.034494] Epoch: [1] [6410/6500] lr: 0.000039 closs: 0.7509 (0.7627) grad_norm: 0.3656 (0.4547) time: 5.5774 data: 0.0001 max mem: 71357
+[13:24:17.080944] Epoch: [1] [6420/6500] lr: 0.000039 closs: 0.7457 (0.7627) grad_norm: 0.3553 (0.4545) time: 5.5911 data: 0.0001 max mem: 71357
+[13:25:12.899426] Epoch: [1] [6430/6500] lr: 0.000039 closs: 0.7345 (0.7628) grad_norm: 0.3532 (0.4545) time: 5.5931 data: 0.0001 max mem: 71357
+[13:26:08.709394] Epoch: [1] [6440/6500] lr: 0.000039 closs: 0.7518 (0.7628) grad_norm: 0.3989 (0.4544) time: 5.5813 data: 0.0001 max mem: 71357
+[13:27:04.428882] Epoch: [1] [6450/6500] lr: 0.000039 closs: 0.7228 (0.7627) grad_norm: 0.3685 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357
+[13:28:00.252225] Epoch: [1] [6460/6500] lr: 0.000039 closs: 0.7051 (0.7626) grad_norm: 0.3989 (0.4543) time: 5.5770 data: 0.0001 max mem: 71357
+[13:28:56.043953] Epoch: [1] [6470/6500] lr: 0.000039 closs: 0.7380 (0.7626) grad_norm: 0.4243 (0.4545) time: 5.5807 data: 0.0001 max mem: 71357
+[13:29:51.797569] Epoch: [1] [6480/6500] lr: 0.000039 closs: 0.7187 (0.7625) grad_norm: 0.3741 (0.4543) time: 5.5772 data: 0.0001 max mem: 71357
+[13:30:47.516773] Epoch: [1] [6490/6500] lr: 0.000039 closs: 0.6832 (0.7624) grad_norm: 0.4119 (0.4546) time: 5.5735 data: 0.0001 max mem: 71357
+[13:31:38.239874] Epoch: [1] Total time: 10:04:02
+[13:31:38.277561] Averaged stats: lr: 0.000039 closs: 0.6790 (0.7624) grad_norm: 0.3968 (0.4545)
+[13:31:38.437483] model saved
+[13:31:39.344162] optimizer saved
+[13:31:39.344627] other rank-common saved
+[13:31:39.347843] rank-specific saved
+[13:31:39.356891] log_dir: ./output_dir
+[13:31:47.519594] Epoch: [2] [0/6500] lr: 0.000039 closs: 0.8530 (0.8530) time: 8.1620 data: 2.5124 max mem: 71357
+[13:32:43.288252] Epoch:
[2] [10/6500] lr: 0.000039 closs: 0.7463 (0.7264) grad_norm: 0.3308 (0.3256) time: 5.8118 data: 0.2286 max mem: 71357 +[13:33:39.077409] Epoch: [2] [20/6500] lr: 0.000039 closs: 0.7463 (0.7621) grad_norm: 0.3325 (0.3434) time: 5.5778 data: 0.0001 max mem: 71357 +[13:34:34.867998] Epoch: [2] [30/6500] lr: 0.000039 closs: 0.7167 (0.7603) grad_norm: 0.3547 (0.4323) time: 5.5789 data: 0.0001 max mem: 71357 +[13:35:30.670234] Epoch: [2] [40/6500] lr: 0.000039 closs: 0.7167 (0.7590) grad_norm: 0.3576 (0.4284) time: 5.5796 data: 0.0001 max mem: 71357 +[13:36:26.526233] Epoch: [2] [50/6500] lr: 0.000039 closs: 0.7527 (0.7555) grad_norm: 0.3960 (0.4176) time: 5.5828 data: 0.0001 max mem: 71357 +[13:37:22.300149] Epoch: [2] [60/6500] lr: 0.000039 closs: 0.7448 (0.7551) grad_norm: 0.4199 (0.4239) time: 5.5814 data: 0.0001 max mem: 71357 +[13:38:18.028707] Epoch: [2] [70/6500] lr: 0.000039 closs: 0.7028 (0.7479) grad_norm: 0.3856 (0.4157) time: 5.5750 data: 0.0001 max mem: 71357 +[13:39:13.821819] Epoch: [2] [80/6500] lr: 0.000038 closs: 0.7377 (0.7468) grad_norm: 0.3691 (0.4100) time: 5.5760 data: 0.0001 max mem: 71357 +[13:40:09.683506] Epoch: [2] [90/6500] lr: 0.000038 closs: 0.7571 (0.7481) grad_norm: 0.3691 (0.4179) time: 5.5826 data: 0.0001 max mem: 71357 +[13:41:05.487770] Epoch: [2] [100/6500] lr: 0.000038 closs: 0.7571 (0.7515) grad_norm: 0.3682 (0.4140) time: 5.5832 data: 0.0001 max mem: 71357 +[13:42:01.187296] Epoch: [2] [110/6500] lr: 0.000038 closs: 0.7371 (0.7502) grad_norm: 0.3937 (0.4187) time: 5.5751 data: 0.0001 max mem: 71357 +[13:42:56.922471] Epoch: [2] [120/6500] lr: 0.000038 closs: 0.7371 (0.7516) grad_norm: 0.3937 (0.4227) time: 5.5716 data: 0.0002 max mem: 71357 +[13:43:52.820057] Epoch: [2] [130/6500] lr: 0.000038 closs: 0.7830 (0.7562) grad_norm: 0.4010 (0.4249) time: 5.5815 data: 0.0002 max mem: 71357 +[13:44:48.624177] Epoch: [2] [140/6500] lr: 0.000038 closs: 0.7541 (0.7543) grad_norm: 0.4010 (0.4211) time: 5.5850 data: 0.0001 max mem: 71357 +[13:45:44.395166] Epoch: [2] [150/6500] lr: 0.000038 closs: 0.6841 (0.7539) grad_norm: 0.3672 (0.4165) time: 5.5787 data: 0.0001 max mem: 71357 +[13:46:40.119569] Epoch: [2] [160/6500] lr: 0.000038 closs: 0.7270 (0.7544) grad_norm: 0.4010 (0.4255) time: 5.5747 data: 0.0001 max mem: 71357 +[13:47:35.890460] Epoch: [2] [170/6500] lr: 0.000038 closs: 0.7027 (0.7530) grad_norm: 0.3672 (0.4220) time: 5.5747 data: 0.0001 max mem: 71357 +[13:48:31.736860] Epoch: [2] [180/6500] lr: 0.000038 closs: 0.7138 (0.7518) grad_norm: 0.3619 (0.4208) time: 5.5808 data: 0.0001 max mem: 71357 +[13:49:27.392286] Epoch: [2] [190/6500] lr: 0.000038 closs: 0.7209 (0.7522) grad_norm: 0.3753 (0.4228) time: 5.5750 data: 0.0001 max mem: 71357 +[13:50:23.153601] Epoch: [2] [200/6500] lr: 0.000038 closs: 0.7229 (0.7506) grad_norm: 0.3753 (0.4260) time: 5.5707 data: 0.0001 max mem: 71357 +[13:51:18.968289] Epoch: [2] [210/6500] lr: 0.000038 closs: 0.7229 (0.7512) grad_norm: 0.4066 (0.4268) time: 5.5787 data: 0.0001 max mem: 71357 +[13:52:14.932241] Epoch: [2] [220/6500] lr: 0.000038 closs: 0.6851 (0.7505) grad_norm: 0.4066 (0.4231) time: 5.5888 data: 0.0001 max mem: 71357 +[13:53:10.610429] Epoch: [2] [230/6500] lr: 0.000038 closs: 0.7820 (0.7548) grad_norm: 0.3845 (0.4203) time: 5.5820 data: 0.0001 max mem: 71357 +[13:54:06.355460] Epoch: [2] [240/6500] lr: 0.000038 closs: 0.7820 (0.7562) grad_norm: 0.3753 (0.4202) time: 5.5711 data: 0.0001 max mem: 71357 +[13:55:02.130045] Epoch: [2] [250/6500] lr: 0.000038 closs: 0.7681 (0.7567) grad_norm: 0.3540 (0.4181) time: 
5.5759 data: 0.0001 max mem: 71357 +[13:55:57.941222] Epoch: [2] [260/6500] lr: 0.000038 closs: 0.7585 (0.7572) grad_norm: 0.3630 (0.4196) time: 5.5792 data: 0.0001 max mem: 71357 +[13:56:53.824737] Epoch: [2] [270/6500] lr: 0.000038 closs: 0.7585 (0.7570) grad_norm: 0.3699 (0.4187) time: 5.5847 data: 0.0001 max mem: 71357 +[13:57:49.697171] Epoch: [2] [280/6500] lr: 0.000038 closs: 0.7405 (0.7566) grad_norm: 0.3712 (0.4186) time: 5.5877 data: 0.0001 max mem: 71357 +[13:58:45.437495] Epoch: [2] [290/6500] lr: 0.000038 closs: 0.7335 (0.7546) grad_norm: 0.3789 (0.4163) time: 5.5806 data: 0.0001 max mem: 71357 +[13:59:41.195413] Epoch: [2] [300/6500] lr: 0.000038 closs: 0.6860 (0.7536) grad_norm: 0.3789 (0.4153) time: 5.5748 data: 0.0001 max mem: 71357 +[14:00:37.032270] Epoch: [2] [310/6500] lr: 0.000038 closs: 0.7333 (0.7531) grad_norm: 0.3733 (0.4166) time: 5.5796 data: 0.0001 max mem: 71357 +[14:01:32.744167] Epoch: [2] [320/6500] lr: 0.000038 closs: 0.7333 (0.7524) grad_norm: 0.3825 (0.4177) time: 5.5773 data: 0.0001 max mem: 71357 +[14:02:28.443547] Epoch: [2] [330/6500] lr: 0.000038 closs: 0.7277 (0.7520) grad_norm: 0.3988 (0.4181) time: 5.5705 data: 0.0001 max mem: 71357 +[14:03:24.231708] Epoch: [2] [340/6500] lr: 0.000038 closs: 0.7531 (0.7512) grad_norm: 0.3988 (0.4170) time: 5.5743 data: 0.0001 max mem: 71357 +[14:04:20.006444] Epoch: [2] [350/6500] lr: 0.000038 closs: 0.7179 (0.7498) grad_norm: 0.4000 (0.4180) time: 5.5780 data: 0.0001 max mem: 71357 +[14:05:15.765801] Epoch: [2] [360/6500] lr: 0.000038 closs: 0.6998 (0.7484) grad_norm: 0.3938 (0.4171) time: 5.5766 data: 0.0001 max mem: 71357 +[14:06:11.530058] Epoch: [2] [370/6500] lr: 0.000038 closs: 0.6998 (0.7490) grad_norm: 0.3695 (0.4163) time: 5.5761 data: 0.0001 max mem: 71357 +[14:07:07.345190] Epoch: [2] [380/6500] lr: 0.000038 closs: 0.6916 (0.7479) grad_norm: 0.3695 (0.4148) time: 5.5789 data: 0.0001 max mem: 71357 +[14:08:03.215899] Epoch: [2] [390/6500] lr: 0.000038 closs: 0.7538 (0.7490) grad_norm: 0.3527 (0.4131) time: 5.5842 data: 0.0001 max mem: 71357 +[14:08:59.012783] Epoch: [2] [400/6500] lr: 0.000037 closs: 0.7534 (0.7484) grad_norm: 0.3527 (0.4176) time: 5.5833 data: 0.0001 max mem: 71357 +[14:09:54.780467] Epoch: [2] [410/6500] lr: 0.000037 closs: 0.6848 (0.7468) grad_norm: 0.3885 (0.4227) time: 5.5781 data: 0.0001 max mem: 71357 +[14:10:50.583135] Epoch: [2] [420/6500] lr: 0.000037 closs: 0.7102 (0.7481) grad_norm: 0.4692 (0.4295) time: 5.5784 data: 0.0001 max mem: 71357 +[14:11:46.513039] Epoch: [2] [430/6500] lr: 0.000037 closs: 0.7994 (0.7501) grad_norm: 0.4802 (0.4279) time: 5.5865 data: 0.0002 max mem: 71357 +[14:12:42.399415] Epoch: [2] [440/6500] lr: 0.000037 closs: 0.7675 (0.7508) grad_norm: 0.3810 (0.4284) time: 5.5907 data: 0.0002 max mem: 71357 +[14:13:38.199675] Epoch: [2] [450/6500] lr: 0.000037 closs: 0.7309 (0.7514) grad_norm: 0.3810 (0.4298) time: 5.5842 data: 0.0002 max mem: 71357 +[14:14:34.022210] Epoch: [2] [460/6500] lr: 0.000037 closs: 0.7586 (0.7526) grad_norm: 0.3805 (0.4293) time: 5.5810 data: 0.0001 max mem: 71357 +[14:15:29.816050] Epoch: [2] [470/6500] lr: 0.000037 closs: 0.7586 (0.7531) grad_norm: 0.3805 (0.4281) time: 5.5807 data: 0.0001 max mem: 71357 +[14:16:25.599470] Epoch: [2] [480/6500] lr: 0.000037 closs: 0.7866 (0.7538) grad_norm: 0.3662 (0.4269) time: 5.5788 data: 0.0001 max mem: 71357 +[14:17:21.518261] Epoch: [2] [490/6500] lr: 0.000037 closs: 0.8141 (0.7544) grad_norm: 0.3628 (0.4270) time: 5.5850 data: 0.0001 max mem: 71357 +[14:18:17.320636] Epoch: [2] 
[500/6500] lr: 0.000037 closs: 0.8202 (0.7549) grad_norm: 0.3662 (0.4269) time: 5.5860 data: 0.0001 max mem: 71357 +[14:19:12.994870] Epoch: [2] [510/6500] lr: 0.000037 closs: 0.7792 (0.7557) grad_norm: 0.4105 (0.4281) time: 5.5737 data: 0.0001 max mem: 71357 +[14:20:08.814286] Epoch: [2] [520/6500] lr: 0.000037 closs: 0.7314 (0.7551) grad_norm: 0.4120 (0.4289) time: 5.5746 data: 0.0001 max mem: 71357 +[14:21:04.579633] Epoch: [2] [530/6500] lr: 0.000037 closs: 0.7105 (0.7545) grad_norm: 0.4370 (0.4294) time: 5.5791 data: 0.0001 max mem: 71357 +[14:22:00.313354] Epoch: [2] [540/6500] lr: 0.000037 closs: 0.6849 (0.7527) grad_norm: 0.4191 (0.4286) time: 5.5749 data: 0.0001 max mem: 71357 +[14:22:56.019516] Epoch: [2] [550/6500] lr: 0.000037 closs: 0.7390 (0.7546) grad_norm: 0.4063 (0.4291) time: 5.5719 data: 0.0001 max mem: 71357 +[14:23:51.755868] Epoch: [2] [560/6500] lr: 0.000037 closs: 0.8036 (0.7556) grad_norm: 0.4064 (0.4301) time: 5.5721 data: 0.0001 max mem: 71357 +[14:24:47.612236] Epoch: [2] [570/6500] lr: 0.000037 closs: 0.7781 (0.7556) grad_norm: 0.4063 (0.4297) time: 5.5795 data: 0.0001 max mem: 71357 +[14:25:43.321070] Epoch: [2] [580/6500] lr: 0.000037 closs: 0.7385 (0.7558) grad_norm: 0.4227 (0.4305) time: 5.5781 data: 0.0001 max mem: 71357 +[14:26:39.029217] Epoch: [2] [590/6500] lr: 0.000037 closs: 0.8222 (0.7575) grad_norm: 0.4322 (0.4309) time: 5.5707 data: 0.0001 max mem: 71357 +[14:27:34.840416] Epoch: [2] [600/6500] lr: 0.000037 closs: 0.8283 (0.7582) grad_norm: 0.4322 (0.4303) time: 5.5759 data: 0.0001 max mem: 71357 +[14:28:30.571734] Epoch: [2] [610/6500] lr: 0.000037 closs: 0.6999 (0.7561) grad_norm: 0.4594 (0.4356) time: 5.5771 data: 0.0001 max mem: 71357 +[14:29:26.385644] Epoch: [2] [620/6500] lr: 0.000037 closs: 0.6217 (0.7547) grad_norm: 0.4041 (0.4344) time: 5.5772 data: 0.0001 max mem: 71357 +[14:30:22.152023] Epoch: [2] [630/6500] lr: 0.000037 closs: 0.6924 (0.7546) grad_norm: 0.3903 (0.4342) time: 5.5789 data: 0.0001 max mem: 71357 +[14:31:17.977329] Epoch: [2] [640/6500] lr: 0.000037 closs: 0.7096 (0.7539) grad_norm: 0.3903 (0.4346) time: 5.5795 data: 0.0001 max mem: 71357 +[14:32:13.809917] Epoch: [2] [650/6500] lr: 0.000037 closs: 0.7096 (0.7538) grad_norm: 0.3726 (0.4336) time: 5.5828 data: 0.0001 max mem: 71357 +[14:33:09.675270] Epoch: [2] [660/6500] lr: 0.000037 closs: 0.7337 (0.7539) grad_norm: 0.3889 (0.4341) time: 5.5848 data: 0.0001 max mem: 71357 +[14:34:05.473554] Epoch: [2] [670/6500] lr: 0.000037 closs: 0.6978 (0.7532) grad_norm: 0.3686 (0.4333) time: 5.5831 data: 0.0001 max mem: 71357 +[14:35:01.331221] Epoch: [2] [680/6500] lr: 0.000037 closs: 0.7212 (0.7526) grad_norm: 0.3643 (0.4318) time: 5.5827 data: 0.0001 max mem: 71357 +[14:35:57.063267] Epoch: [2] [690/6500] lr: 0.000037 closs: 0.7305 (0.7527) grad_norm: 0.3668 (0.4318) time: 5.5794 data: 0.0001 max mem: 71357 +[14:36:52.859279] Epoch: [2] [700/6500] lr: 0.000036 closs: 0.7603 (0.7529) grad_norm: 0.3595 (0.4308) time: 5.5763 data: 0.0001 max mem: 71357 +[14:37:48.751222] Epoch: [2] [710/6500] lr: 0.000036 closs: 0.7380 (0.7525) grad_norm: 0.3590 (0.4302) time: 5.5843 data: 0.0002 max mem: 71357 +[14:38:44.494905] Epoch: [2] [720/6500] lr: 0.000036 closs: 0.7349 (0.7529) grad_norm: 0.3672 (0.4298) time: 5.5817 data: 0.0002 max mem: 71357 +[14:39:40.195246] Epoch: [2] [730/6500] lr: 0.000036 closs: 0.7399 (0.7520) grad_norm: 0.4055 (0.4299) time: 5.5721 data: 0.0001 max mem: 71357 +[14:40:36.044045] Epoch: [2] [740/6500] lr: 0.000036 closs: 0.7879 (0.7533) grad_norm: 0.3741 (0.4294) 
time: 5.5774 data: 0.0001 max mem: 71357 +[14:41:31.881779] Epoch: [2] [750/6500] lr: 0.000036 closs: 0.8348 (0.7536) grad_norm: 0.4074 (0.4296) time: 5.5843 data: 0.0001 max mem: 71357 +[14:42:27.626745] Epoch: [2] [760/6500] lr: 0.000036 closs: 0.7557 (0.7528) grad_norm: 0.4074 (0.4299) time: 5.5790 data: 0.0001 max mem: 71357 +[14:43:23.427493] Epoch: [2] [770/6500] lr: 0.000036 closs: 0.7330 (0.7528) grad_norm: 0.4074 (0.4302) time: 5.5772 data: 0.0001 max mem: 71357 +[14:44:19.252177] Epoch: [2] [780/6500] lr: 0.000036 closs: 0.7833 (0.7537) grad_norm: 0.4074 (0.4319) time: 5.5812 data: 0.0001 max mem: 71357 +[14:45:15.139740] Epoch: [2] [790/6500] lr: 0.000036 closs: 0.7768 (0.7542) grad_norm: 0.3707 (0.4311) time: 5.5855 data: 0.0001 max mem: 71357 +[14:46:10.918823] Epoch: [2] [800/6500] lr: 0.000036 closs: 0.7081 (0.7539) grad_norm: 0.3902 (0.4308) time: 5.5833 data: 0.0001 max mem: 71357 +[14:47:06.736186] Epoch: [2] [810/6500] lr: 0.000036 closs: 0.7008 (0.7532) grad_norm: 0.3791 (0.4306) time: 5.5797 data: 0.0001 max mem: 71357 +[14:48:02.479681] Epoch: [2] [820/6500] lr: 0.000036 closs: 0.7034 (0.7527) grad_norm: 0.3791 (0.4314) time: 5.5779 data: 0.0002 max mem: 71357 +[14:48:58.271678] Epoch: [2] [830/6500] lr: 0.000036 closs: 0.6858 (0.7525) grad_norm: 0.4105 (0.4314) time: 5.5766 data: 0.0002 max mem: 71357 +[14:49:54.210878] Epoch: [2] [840/6500] lr: 0.000036 closs: 0.7823 (0.7531) grad_norm: 0.4233 (0.4315) time: 5.5865 data: 0.0001 max mem: 71357 +[14:50:49.920972] Epoch: [2] [850/6500] lr: 0.000036 closs: 0.7854 (0.7535) grad_norm: 0.4315 (0.4319) time: 5.5824 data: 0.0001 max mem: 71357 +[14:51:45.745245] Epoch: [2] [860/6500] lr: 0.000036 closs: 0.7603 (0.7538) grad_norm: 0.4249 (0.4585) time: 5.5766 data: 0.0002 max mem: 71357 +[14:52:41.602665] Epoch: [2] [870/6500] lr: 0.000036 closs: 0.7513 (0.7540) grad_norm: 0.4205 (0.4580) time: 5.5840 data: 0.0002 max mem: 71357 +[14:53:37.485162] Epoch: [2] [880/6500] lr: 0.000036 closs: 0.7341 (0.7539) grad_norm: 0.3926 (0.4577) time: 5.5869 data: 0.0002 max mem: 71357 +[14:54:33.281177] Epoch: [2] [890/6500] lr: 0.000036 closs: 0.7098 (0.7535) grad_norm: 0.3598 (0.4576) time: 5.5838 data: 0.0002 max mem: 71357 +[14:55:29.092517] Epoch: [2] [900/6500] lr: 0.000036 closs: 0.7047 (0.7527) grad_norm: 0.3727 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 +[14:56:24.907964] Epoch: [2] [910/6500] lr: 0.000036 closs: 0.7047 (0.7532) grad_norm: 0.3870 (0.4592) time: 5.5812 data: 0.0002 max mem: 71357 +[14:57:20.783829] Epoch: [2] [920/6500] lr: 0.000036 closs: 0.7544 (0.7532) grad_norm: 0.3975 (0.4588) time: 5.5844 data: 0.0002 max mem: 71357 +[14:58:16.764824] Epoch: [2] [930/6500] lr: 0.000036 closs: 0.7854 (0.7545) grad_norm: 0.3975 (0.4591) time: 5.5927 data: 0.0002 max mem: 71357 +[14:59:12.669502] Epoch: [2] [940/6500] lr: 0.000036 closs: 0.8281 (0.7545) grad_norm: 0.3922 (0.4587) time: 5.5942 data: 0.0002 max mem: 71357 +[15:00:08.554641] Epoch: [2] [950/6500] lr: 0.000036 closs: 0.7504 (0.7542) grad_norm: 0.3922 (0.4584) time: 5.5894 data: 0.0002 max mem: 71357 +[15:01:04.431999] Epoch: [2] [960/6500] lr: 0.000036 closs: 0.7345 (0.7542) grad_norm: 0.3973 (0.4582) time: 5.5880 data: 0.0002 max mem: 71357 +[15:02:00.303487] Epoch: [2] [970/6500] lr: 0.000036 closs: 0.6935 (0.7539) grad_norm: 0.4130 (0.4579) time: 5.5873 data: 0.0002 max mem: 71357 +[15:02:56.106192] Epoch: [2] [980/6500] lr: 0.000036 closs: 0.6935 (0.7538) grad_norm: 0.3919 (0.4570) time: 5.5836 data: 0.0002 max mem: 71357 +[15:03:51.815680] Epoch: [2] 
[990/6500] lr: 0.000036 closs: 0.6868 (0.7532) grad_norm: 0.3815 (0.4564) time: 5.5755 data: 0.0002 max mem: 71357 +[15:04:47.621183] Epoch: [2] [1000/6500] lr: 0.000035 closs: 0.6118 (0.7523) grad_norm: 0.3674 (0.4557) time: 5.5756 data: 0.0002 max mem: 71357 +[15:05:43.550443] Epoch: [2] [1010/6500] lr: 0.000035 closs: 0.6951 (0.7526) grad_norm: 0.3764 (0.4553) time: 5.5866 data: 0.0002 max mem: 71357 +[15:06:39.399591] Epoch: [2] [1020/6500] lr: 0.000035 closs: 0.7234 (0.7525) grad_norm: 0.3825 (0.4549) time: 5.5888 data: 0.0002 max mem: 71357 +[15:07:35.244881] Epoch: [2] [1030/6500] lr: 0.000035 closs: 0.7264 (0.7526) grad_norm: 0.4198 (0.4550) time: 5.5846 data: 0.0002 max mem: 71357 +[15:08:31.032761] Epoch: [2] [1040/6500] lr: 0.000035 closs: 0.7264 (0.7522) grad_norm: 0.4198 (0.4551) time: 5.5815 data: 0.0002 max mem: 71357 +[15:09:26.908118] Epoch: [2] [1050/6500] lr: 0.000035 closs: 0.7246 (0.7525) grad_norm: 0.4107 (0.4542) time: 5.5830 data: 0.0002 max mem: 71357 +[15:10:22.920376] Epoch: [2] [1060/6500] lr: 0.000035 closs: 0.7357 (0.7525) grad_norm: 0.4051 (0.4537) time: 5.5943 data: 0.0002 max mem: 71357 +[15:11:18.721035] Epoch: [2] [1070/6500] lr: 0.000035 closs: 0.7357 (0.7528) grad_norm: 0.3653 (0.4529) time: 5.5905 data: 0.0002 max mem: 71357 +[15:12:14.492900] Epoch: [2] [1080/6500] lr: 0.000035 closs: 0.7322 (0.7529) grad_norm: 0.3654 (0.4585) time: 5.5785 data: 0.0002 max mem: 71357 +[15:13:10.357333] Epoch: [2] [1090/6500] lr: 0.000035 closs: 0.7251 (0.7528) grad_norm: 0.4259 (0.4585) time: 5.5817 data: 0.0002 max mem: 71357 +[15:14:06.180945] Epoch: [2] [1100/6500] lr: 0.000035 closs: 0.7086 (0.7525) grad_norm: 0.4282 (0.4583) time: 5.5843 data: 0.0002 max mem: 71357 +[15:15:01.964084] Epoch: [2] [1110/6500] lr: 0.000035 closs: 0.6976 (0.7523) grad_norm: 0.4664 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 +[15:15:57.685490] Epoch: [2] [1120/6500] lr: 0.000035 closs: 0.7384 (0.7523) grad_norm: 0.3756 (0.4574) time: 5.5751 data: 0.0002 max mem: 71357 +[15:16:53.523206] Epoch: [2] [1130/6500] lr: 0.000035 closs: 0.7384 (0.7522) grad_norm: 0.3756 (0.4573) time: 5.5779 data: 0.0002 max mem: 71357 +[15:17:49.360003] Epoch: [2] [1140/6500] lr: 0.000035 closs: 0.7554 (0.7525) grad_norm: 0.3694 (0.4568) time: 5.5836 data: 0.0002 max mem: 71357 +[15:18:45.316724] Epoch: [2] [1150/6500] lr: 0.000035 closs: 0.7964 (0.7526) grad_norm: 0.3718 (0.4563) time: 5.5895 data: 0.0002 max mem: 71357 +[15:19:41.125463] Epoch: [2] [1160/6500] lr: 0.000035 closs: 0.7983 (0.7529) grad_norm: 0.3694 (0.4561) time: 5.5881 data: 0.0002 max mem: 71357 +[15:20:36.889226] Epoch: [2] [1170/6500] lr: 0.000035 closs: 0.7632 (0.7523) grad_norm: 0.3718 (0.4555) time: 5.5785 data: 0.0002 max mem: 71357 +[15:21:32.660197] Epoch: [2] [1180/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4002 (0.4550) time: 5.5766 data: 0.0002 max mem: 71357 +[15:22:28.516278] Epoch: [2] [1190/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4274 (0.4553) time: 5.5813 data: 0.0002 max mem: 71357 +[15:23:24.346365] Epoch: [2] [1200/6500] lr: 0.000035 closs: 0.7446 (0.7525) grad_norm: 0.4099 (0.4549) time: 5.5842 data: 0.0002 max mem: 71357 +[15:24:20.134239] Epoch: [2] [1210/6500] lr: 0.000035 closs: 0.6904 (0.7520) grad_norm: 0.4104 (0.4544) time: 5.5807 data: 0.0002 max mem: 71357 +[15:25:15.925899] Epoch: [2] [1220/6500] lr: 0.000035 closs: 0.7181 (0.7522) grad_norm: 0.4152 (0.4552) time: 5.5788 data: 0.0002 max mem: 71357 +[15:26:11.888462] Epoch: [2] [1230/6500] lr: 0.000035 closs: 0.7267 (0.7519) 
grad_norm: 0.4083 (0.4548) time: 5.5876 data: 0.0002 max mem: 71357 +[15:27:07.646299] Epoch: [2] [1240/6500] lr: 0.000035 closs: 0.6987 (0.7515) grad_norm: 0.4104 (0.4549) time: 5.5859 data: 0.0002 max mem: 71357 +[15:28:03.389698] Epoch: [2] [1250/6500] lr: 0.000035 closs: 0.6987 (0.7520) grad_norm: 0.4244 (0.4545) time: 5.5750 data: 0.0002 max mem: 71357 +[15:28:59.282048] Epoch: [2] [1260/6500] lr: 0.000035 closs: 0.6953 (0.7516) grad_norm: 0.4244 (0.4546) time: 5.5816 data: 0.0002 max mem: 71357 +[15:29:55.061908] Epoch: [2] [1270/6500] lr: 0.000035 closs: 0.7070 (0.7511) grad_norm: 0.4485 (0.4550) time: 5.5834 data: 0.0002 max mem: 71357 +[15:30:50.944627] Epoch: [2] [1280/6500] lr: 0.000035 closs: 0.7370 (0.7518) grad_norm: 0.4485 (0.4555) time: 5.5830 data: 0.0002 max mem: 71357 +[15:31:46.666142] Epoch: [2] [1290/6500] lr: 0.000034 closs: 0.7087 (0.7517) grad_norm: 0.4504 (0.4558) time: 5.5801 data: 0.0001 max mem: 71357 +[15:32:42.433467] Epoch: [2] [1300/6500] lr: 0.000034 closs: 0.7037 (0.7513) grad_norm: 0.4323 (0.4552) time: 5.5744 data: 0.0001 max mem: 71357 +[15:33:38.181099] Epoch: [2] [1310/6500] lr: 0.000034 closs: 0.7091 (0.7517) grad_norm: 0.4323 (0.4556) time: 5.5757 data: 0.0002 max mem: 71357 +[15:34:33.994700] Epoch: [2] [1320/6500] lr: 0.000034 closs: 0.7841 (0.7518) grad_norm: 0.4367 (0.4562) time: 5.5780 data: 0.0002 max mem: 71357 +[15:35:29.692112] Epoch: [2] [1330/6500] lr: 0.000034 closs: 0.7856 (0.7518) grad_norm: 0.3952 (0.4559) time: 5.5754 data: 0.0002 max mem: 71357 +[15:36:25.439166] Epoch: [2] [1340/6500] lr: 0.000034 closs: 0.7897 (0.7518) grad_norm: 0.4837 (0.4569) time: 5.5721 data: 0.0001 max mem: 71357 +[15:37:21.352355] Epoch: [2] [1350/6500] lr: 0.000034 closs: 0.7514 (0.7517) grad_norm: 0.4169 (0.4564) time: 5.5829 data: 0.0001 max mem: 71357 +[15:38:17.111671] Epoch: [2] [1360/6500] lr: 0.000034 closs: 0.7093 (0.7515) grad_norm: 0.3850 (0.4564) time: 5.5835 data: 0.0002 max mem: 71357 +[15:39:12.952198] Epoch: [2] [1370/6500] lr: 0.000034 closs: 0.7113 (0.7511) grad_norm: 0.4112 (0.4626) time: 5.5799 data: 0.0002 max mem: 71357 +[15:40:08.662786] Epoch: [2] [1380/6500] lr: 0.000034 closs: 0.7377 (0.7512) grad_norm: 0.3917 (0.4633) time: 5.5775 data: 0.0001 max mem: 71357 +[15:41:04.467847] Epoch: [2] [1390/6500] lr: 0.000034 closs: 0.7426 (0.7512) grad_norm: 0.4009 (0.4635) time: 5.5756 data: 0.0002 max mem: 71357 +[15:42:00.167536] Epoch: [2] [1400/6500] lr: 0.000034 closs: 0.7377 (0.7511) grad_norm: 0.4512 (0.4636) time: 5.5751 data: 0.0002 max mem: 71357 +[15:42:56.023589] Epoch: [2] [1410/6500] lr: 0.000034 closs: 0.7182 (0.7510) grad_norm: 0.4512 (0.4638) time: 5.5777 data: 0.0002 max mem: 71357 +[15:43:51.710783] Epoch: [2] [1420/6500] lr: 0.000034 closs: 0.7622 (0.7516) grad_norm: 0.4390 (0.4632) time: 5.5771 data: 0.0002 max mem: 71357 +[15:44:47.432279] Epoch: [2] [1430/6500] lr: 0.000034 closs: 0.8510 (0.7520) grad_norm: 0.4332 (0.4633) time: 5.5703 data: 0.0002 max mem: 71357 +[15:45:43.110068] Epoch: [2] [1440/6500] lr: 0.000034 closs: 0.8121 (0.7523) grad_norm: 0.4311 (0.4631) time: 5.5698 data: 0.0002 max mem: 71357 +[15:46:39.076188] Epoch: [2] [1450/6500] lr: 0.000034 closs: 0.7827 (0.7524) grad_norm: 0.3811 (0.4628) time: 5.5821 data: 0.0002 max mem: 71357 +[15:47:34.786235] Epoch: [2] [1460/6500] lr: 0.000034 closs: 0.7050 (0.7523) grad_norm: 0.3816 (0.4624) time: 5.5837 data: 0.0002 max mem: 71357 +[15:48:30.542459] Epoch: [2] [1470/6500] lr: 0.000034 closs: 0.7427 (0.7526) grad_norm: 0.3891 (0.4620) time: 5.5732 data: 
0.0001 max mem: 71357 +[15:49:26.323343] Epoch: [2] [1480/6500] lr: 0.000034 closs: 0.7493 (0.7524) grad_norm: 0.3891 (0.4615) time: 5.5767 data: 0.0001 max mem: 71357 +[15:50:22.063916] Epoch: [2] [1490/6500] lr: 0.000034 closs: 0.7493 (0.7529) grad_norm: 0.4173 (0.4618) time: 5.5759 data: 0.0001 max mem: 71357 +[15:51:17.879657] Epoch: [2] [1500/6500] lr: 0.000034 closs: 0.7679 (0.7528) grad_norm: 0.4173 (0.4612) time: 5.5777 data: 0.0001 max mem: 71357 +[15:52:13.600143] Epoch: [2] [1510/6500] lr: 0.000034 closs: 0.7527 (0.7528) grad_norm: 0.4107 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 +[15:53:09.349932] Epoch: [2] [1520/6500] lr: 0.000034 closs: 0.7025 (0.7528) grad_norm: 0.4178 (0.4605) time: 5.5734 data: 0.0002 max mem: 71357 +[15:54:05.153207] Epoch: [2] [1530/6500] lr: 0.000034 closs: 0.6863 (0.7526) grad_norm: 0.3977 (0.4603) time: 5.5776 data: 0.0002 max mem: 71357 +[15:55:01.034381] Epoch: [2] [1540/6500] lr: 0.000034 closs: 0.6656 (0.7521) grad_norm: 0.3977 (0.4598) time: 5.5841 data: 0.0002 max mem: 71357 +[15:55:56.795048] Epoch: [2] [1550/6500] lr: 0.000034 closs: 0.6656 (0.7522) grad_norm: 0.3977 (0.4595) time: 5.5820 data: 0.0002 max mem: 71357 +[15:56:52.611293] Epoch: [2] [1560/6500] lr: 0.000034 closs: 0.7849 (0.7525) grad_norm: 0.3977 (0.4592) time: 5.5787 data: 0.0001 max mem: 71357 +[15:57:48.402966] Epoch: [2] [1570/6500] lr: 0.000034 closs: 0.7785 (0.7525) grad_norm: 0.4062 (0.4590) time: 5.5803 data: 0.0001 max mem: 71357 +[15:58:44.217639] Epoch: [2] [1580/6500] lr: 0.000033 closs: 0.7870 (0.7531) grad_norm: 0.4106 (0.4587) time: 5.5802 data: 0.0001 max mem: 71357 +[15:59:40.002405] Epoch: [2] [1590/6500] lr: 0.000033 closs: 0.7723 (0.7532) grad_norm: 0.4159 (0.4585) time: 5.5799 data: 0.0001 max mem: 71357 +[16:00:35.721907] Epoch: [2] [1600/6500] lr: 0.000033 closs: 0.6743 (0.7524) grad_norm: 0.4302 (0.4586) time: 5.5751 data: 0.0001 max mem: 71357 +[16:01:31.458032] Epoch: [2] [1610/6500] lr: 0.000033 closs: 0.6607 (0.7525) grad_norm: 0.4363 (0.4585) time: 5.5727 data: 0.0001 max mem: 71357 +[16:02:27.297980] Epoch: [2] [1620/6500] lr: 0.000033 closs: 0.7730 (0.7526) grad_norm: 0.4302 (0.4584) time: 5.5787 data: 0.0001 max mem: 71357 +[16:03:23.124468] Epoch: [2] [1630/6500] lr: 0.000033 closs: 0.7734 (0.7526) grad_norm: 0.4404 (0.4611) time: 5.5832 data: 0.0001 max mem: 71357 +[16:04:18.899383] Epoch: [2] [1640/6500] lr: 0.000033 closs: 0.7204 (0.7525) grad_norm: 0.4069 (0.4606) time: 5.5800 data: 0.0001 max mem: 71357 +[16:05:14.642459] Epoch: [2] [1650/6500] lr: 0.000033 closs: 0.7107 (0.7522) grad_norm: 0.3977 (0.4603) time: 5.5758 data: 0.0001 max mem: 71357 +[16:06:10.346118] Epoch: [2] [1660/6500] lr: 0.000033 closs: 0.6985 (0.7520) grad_norm: 0.3828 (0.4600) time: 5.5722 data: 0.0001 max mem: 71357 +[16:07:06.260484] Epoch: [2] [1670/6500] lr: 0.000033 closs: 0.7301 (0.7522) grad_norm: 0.3594 (0.4594) time: 5.5808 data: 0.0001 max mem: 71357 +[16:08:01.979004] Epoch: [2] [1680/6500] lr: 0.000033 closs: 0.7341 (0.7519) grad_norm: 0.3670 (0.4596) time: 5.5816 data: 0.0001 max mem: 71357 +[16:08:57.731380] Epoch: [2] [1690/6500] lr: 0.000033 closs: 0.7382 (0.7520) grad_norm: 0.3670 (0.4593) time: 5.5735 data: 0.0001 max mem: 71357 +[16:09:53.536263] Epoch: [2] [1700/6500] lr: 0.000033 closs: 0.7382 (0.7517) grad_norm: 0.3817 (0.4593) time: 5.5778 data: 0.0001 max mem: 71357 +[16:10:49.268849] Epoch: [2] [1710/6500] lr: 0.000033 closs: 0.7031 (0.7516) grad_norm: 0.4069 (0.4592) time: 5.5768 data: 0.0001 max mem: 71357 +[16:11:45.090787] Epoch: 
[2] [1720/6500] lr: 0.000033 closs: 0.7533 (0.7522) grad_norm: 0.3817 (0.4587) time: 5.5776 data: 0.0001 max mem: 71357
[ ... ~476 per-iteration log entries elided: Epoch 2, steps 1730/6500 through 6480/6500, logged every 10 steps from 16:12:40 to 23:35:15; lr annealed from 0.000033 to 0.000016, closs (avg) held near 0.75, grad_norm (avg) near 0.45-0.46, time ~5.58 s/step, data 0.0001-0.0003 s, max mem: 71357 throughout ... ]
+[23:35:15.552398] Epoch: [2] [6490/6500] lr: 0.000016 closs: 0.7060 (0.7532) grad_norm: 0.3979 (0.4619) time: 5.5681 data: 0.0001 max mem: 71357
+[23:36:06.410367] Epoch: [2] Total time: 10:04:27
+[23:36:06.411251] Averaged stats: lr: 0.000016 closs: 0.7366 (0.7503) grad_norm: 0.3976 (0.4619)
+[23:36:06.578363] model saved
+[23:36:07.582844] optimizer saved
+[23:36:07.583305] other rank-common saved
+[23:36:07.586540] rank-specific saved
+[23:36:07.595987] log_dir: ./output_dir
+[23:36:15.789670] Epoch: [3] [0/6500] lr: 0.000016 closs: 0.6146 (0.6146) time: 8.1929 data: 2.6079 max mem: 71357
[ ... ~29 per-iteration log entries elided: Epoch 3, steps 10/6500 through 290/6500, logged every 10 steps from 23:37:11 to 00:03:12; lr 0.000016 dropping to 0.000015 at step 250, closs (avg) ~0.71-0.74, grad_norm (avg) ~0.40-0.45, time ~5.58 s/step, max mem: 71357 ... ]
+[00:04:08.682771] Epoch: [3] [300/6500] lr: 0.000015 closs: 0.7557 (0.7407) grad_norm: 0.4064 (0.4482) time: 5.5737 data: 0.0001 max mem: 71357
+[00:05:04.350866] Epoch: [3] [310/6500]
lr: 0.000015 closs: 0.7481 (0.7401) grad_norm: 0.4064 (0.4465) time: 5.5713 data: 0.0001 max mem: 71357 +[00:06:00.086395] Epoch: [3] [320/6500] lr: 0.000015 closs: 0.7440 (0.7390) grad_norm: 0.3864 (0.4437) time: 5.5701 data: 0.0001 max mem: 71357 +[00:06:55.945079] Epoch: [3] [330/6500] lr: 0.000015 closs: 0.7742 (0.7414) grad_norm: 0.3987 (0.4443) time: 5.5796 data: 0.0001 max mem: 71357 +[00:07:51.657783] Epoch: [3] [340/6500] lr: 0.000015 closs: 0.7810 (0.7444) grad_norm: 0.3864 (0.4482) time: 5.5785 data: 0.0001 max mem: 71357 +[00:08:47.398301] Epoch: [3] [350/6500] lr: 0.000015 closs: 0.7696 (0.7429) grad_norm: 0.3989 (0.4466) time: 5.5725 data: 0.0001 max mem: 71357 +[00:09:43.161770] Epoch: [3] [360/6500] lr: 0.000015 closs: 0.7516 (0.7433) grad_norm: 0.4093 (0.4457) time: 5.5751 data: 0.0001 max mem: 71357 +[00:10:38.853854] Epoch: [3] [370/6500] lr: 0.000015 closs: 0.7676 (0.7444) grad_norm: 0.4122 (0.4464) time: 5.5727 data: 0.0001 max mem: 71357 +[00:11:34.646934] Epoch: [3] [380/6500] lr: 0.000015 closs: 0.7893 (0.7451) grad_norm: 0.3989 (0.4464) time: 5.5742 data: 0.0001 max mem: 71357 +[00:12:30.405911] Epoch: [3] [390/6500] lr: 0.000015 closs: 0.8063 (0.7453) grad_norm: 0.4085 (0.4461) time: 5.5775 data: 0.0001 max mem: 71357 +[00:13:26.046470] Epoch: [3] [400/6500] lr: 0.000015 closs: 0.7670 (0.7463) grad_norm: 0.4090 (0.4495) time: 5.5699 data: 0.0001 max mem: 71357 +[00:14:21.747482] Epoch: [3] [410/6500] lr: 0.000015 closs: 0.7218 (0.7445) grad_norm: 0.3901 (0.4504) time: 5.5670 data: 0.0001 max mem: 71357 +[00:15:17.668173] Epoch: [3] [420/6500] lr: 0.000015 closs: 0.6873 (0.7438) grad_norm: 0.3901 (0.4491) time: 5.5810 data: 0.0001 max mem: 71357 +[00:16:13.463723] Epoch: [3] [430/6500] lr: 0.000015 closs: 0.7053 (0.7430) grad_norm: 0.3729 (0.4473) time: 5.5857 data: 0.0001 max mem: 71357 +[00:17:09.143060] Epoch: [3] [440/6500] lr: 0.000015 closs: 0.7145 (0.7429) grad_norm: 0.3853 (0.4475) time: 5.5737 data: 0.0001 max mem: 71357 +[00:18:04.852034] Epoch: [3] [450/6500] lr: 0.000015 closs: 0.7834 (0.7436) grad_norm: 0.3907 (0.4470) time: 5.5693 data: 0.0001 max mem: 71357 +[00:19:00.721499] Epoch: [3] [460/6500] lr: 0.000015 closs: 0.7946 (0.7431) grad_norm: 0.3853 (0.4447) time: 5.5788 data: 0.0001 max mem: 71357 +[00:19:56.460552] Epoch: [3] [470/6500] lr: 0.000015 closs: 0.7314 (0.7425) grad_norm: 0.3587 (0.4440) time: 5.5803 data: 0.0001 max mem: 71357 +[00:20:52.121137] Epoch: [3] [480/6500] lr: 0.000015 closs: 0.7648 (0.7444) grad_norm: 0.3587 (0.4444) time: 5.5699 data: 0.0001 max mem: 71357 +[00:21:47.795418] Epoch: [3] [490/6500] lr: 0.000015 closs: 0.7545 (0.7415) grad_norm: 0.3587 (0.4470) time: 5.5667 data: 0.0001 max mem: 71357 +[00:22:43.610741] Epoch: [3] [500/6500] lr: 0.000015 closs: 0.6561 (0.7423) grad_norm: 0.3719 (0.4456) time: 5.5744 data: 0.0001 max mem: 71357 +[00:23:39.289884] Epoch: [3] [510/6500] lr: 0.000015 closs: 0.7800 (0.7433) grad_norm: 0.4030 (0.4467) time: 5.5746 data: 0.0001 max mem: 71357 +[00:24:34.955681] Epoch: [3] [520/6500] lr: 0.000015 closs: 0.7182 (0.7408) grad_norm: 0.3815 (0.4456) time: 5.5672 data: 0.0001 max mem: 71357 +[00:25:30.752698] Epoch: [3] [530/6500] lr: 0.000015 closs: 0.7124 (0.7414) grad_norm: 0.3815 (0.4451) time: 5.5730 data: 0.0001 max mem: 71357 +[00:26:26.458444] Epoch: [3] [540/6500] lr: 0.000015 closs: 0.7216 (0.7405) grad_norm: 0.3815 (0.4445) time: 5.5750 data: 0.0001 max mem: 71357 +[00:27:22.282296] Epoch: [3] [550/6500] lr: 0.000015 closs: 0.7084 (0.7406) grad_norm: 0.3815 (0.4461) time: 
5.5764 data: 0.0001 max mem: 71357 +[00:28:17.976928] Epoch: [3] [560/6500] lr: 0.000015 closs: 0.7254 (0.7403) grad_norm: 0.4211 (0.4493) time: 5.5759 data: 0.0001 max mem: 71357 +[00:29:13.811494] Epoch: [3] [570/6500] lr: 0.000015 closs: 0.6689 (0.7397) grad_norm: 0.3693 (0.4479) time: 5.5764 data: 0.0001 max mem: 71357 +[00:30:09.537691] Epoch: [3] [580/6500] lr: 0.000014 closs: 0.6779 (0.7394) grad_norm: 0.4004 (0.4494) time: 5.5779 data: 0.0001 max mem: 71357 +[00:31:05.357551] Epoch: [3] [590/6500] lr: 0.000014 closs: 0.7062 (0.7387) grad_norm: 0.4039 (0.4512) time: 5.5772 data: 0.0001 max mem: 71357 +[00:32:01.086703] Epoch: [3] [600/6500] lr: 0.000014 closs: 0.6869 (0.7392) grad_norm: 0.4087 (0.4528) time: 5.5774 data: 0.0001 max mem: 71357 +[00:32:56.729576] Epoch: [3] [610/6500] lr: 0.000014 closs: 0.6827 (0.7380) grad_norm: 0.5076 (0.4546) time: 5.5685 data: 0.0001 max mem: 71357 +[00:33:52.466776] Epoch: [3] [620/6500] lr: 0.000014 closs: 0.6962 (0.7388) grad_norm: 0.5076 (0.4552) time: 5.5689 data: 0.0001 max mem: 71357 +[00:34:48.197321] Epoch: [3] [630/6500] lr: 0.000014 closs: 0.7863 (0.7393) grad_norm: 0.5055 (0.4563) time: 5.5733 data: 0.0001 max mem: 71357 +[00:35:43.932519] Epoch: [3] [640/6500] lr: 0.000014 closs: 0.7812 (0.7391) grad_norm: 0.4378 (0.4563) time: 5.5732 data: 0.0001 max mem: 71357 +[00:36:39.649858] Epoch: [3] [650/6500] lr: 0.000014 closs: 0.7366 (0.7392) grad_norm: 0.4040 (0.4562) time: 5.5725 data: 0.0001 max mem: 71357 +[00:37:35.318846] Epoch: [3] [660/6500] lr: 0.000014 closs: 0.6963 (0.7391) grad_norm: 0.4330 (0.4559) time: 5.5692 data: 0.0001 max mem: 71357 +[00:38:30.992475] Epoch: [3] [670/6500] lr: 0.000014 closs: 0.7026 (0.7402) grad_norm: 0.4156 (0.4553) time: 5.5670 data: 0.0001 max mem: 71357 +[00:39:26.871070] Epoch: [3] [680/6500] lr: 0.000014 closs: 0.7783 (0.7407) grad_norm: 0.4330 (0.4568) time: 5.5775 data: 0.0001 max mem: 71357 +[00:40:22.585682] Epoch: [3] [690/6500] lr: 0.000014 closs: 0.7404 (0.7394) grad_norm: 0.4211 (0.4559) time: 5.5795 data: 0.0001 max mem: 71357 +[00:41:18.283696] Epoch: [3] [700/6500] lr: 0.000014 closs: 0.7099 (0.7398) grad_norm: 0.4050 (0.4552) time: 5.5705 data: 0.0001 max mem: 71357 +[00:42:14.020162] Epoch: [3] [710/6500] lr: 0.000014 closs: 0.7389 (0.7400) grad_norm: 0.4050 (0.4566) time: 5.5716 data: 0.0002 max mem: 71357 +[00:43:09.826888] Epoch: [3] [720/6500] lr: 0.000014 closs: 0.7389 (0.7399) grad_norm: 0.3828 (0.4553) time: 5.5771 data: 0.0002 max mem: 71357 +[00:44:05.589010] Epoch: [3] [730/6500] lr: 0.000014 closs: 0.7528 (0.7405) grad_norm: 0.4084 (0.4581) time: 5.5784 data: 0.0001 max mem: 71357 +[00:45:01.263164] Epoch: [3] [740/6500] lr: 0.000014 closs: 0.7142 (0.7401) grad_norm: 0.4105 (0.4583) time: 5.5717 data: 0.0001 max mem: 71357 +[00:45:57.041552] Epoch: [3] [750/6500] lr: 0.000014 closs: 0.7161 (0.7410) grad_norm: 0.4105 (0.4583) time: 5.5725 data: 0.0001 max mem: 71357 +[00:46:52.780365] Epoch: [3] [760/6500] lr: 0.000014 closs: 0.7962 (0.7404) grad_norm: 0.4450 (0.4576) time: 5.5758 data: 0.0001 max mem: 71357 +[00:47:48.538095] Epoch: [3] [770/6500] lr: 0.000014 closs: 0.7276 (0.7410) grad_norm: 0.4256 (0.4577) time: 5.5747 data: 0.0001 max mem: 71357 +[00:48:44.243706] Epoch: [3] [780/6500] lr: 0.000014 closs: 0.7184 (0.7401) grad_norm: 0.4427 (0.4580) time: 5.5731 data: 0.0001 max mem: 71357 +[00:49:39.903887] Epoch: [3] [790/6500] lr: 0.000014 closs: 0.6724 (0.7390) grad_norm: 0.4518 (0.4593) time: 5.5682 data: 0.0001 max mem: 71357 +[00:50:35.705245] Epoch: [3] 
[800/6500] lr: 0.000014 closs: 0.6724 (0.7385) grad_norm: 0.4518 (0.4585) time: 5.5730 data: 0.0001 max mem: 71357 +[00:51:31.514197] Epoch: [3] [810/6500] lr: 0.000014 closs: 0.6940 (0.7379) grad_norm: 0.4427 (0.4582) time: 5.5804 data: 0.0001 max mem: 71357 +[00:52:27.324671] Epoch: [3] [820/6500] lr: 0.000014 closs: 0.7296 (0.7380) grad_norm: 0.4293 (0.4602) time: 5.5809 data: 0.0001 max mem: 71357 +[00:53:23.091518] Epoch: [3] [830/6500] lr: 0.000014 closs: 0.6945 (0.7383) grad_norm: 0.4142 (0.4598) time: 5.5788 data: 0.0001 max mem: 71357 +[00:54:18.874645] Epoch: [3] [840/6500] lr: 0.000014 closs: 0.7030 (0.7381) grad_norm: 0.3834 (0.4588) time: 5.5774 data: 0.0001 max mem: 71357 +[00:55:14.607446] Epoch: [3] [850/6500] lr: 0.000014 closs: 0.7228 (0.7374) grad_norm: 0.3834 (0.4594) time: 5.5757 data: 0.0001 max mem: 71357 +[00:56:10.454988] Epoch: [3] [860/6500] lr: 0.000014 closs: 0.7877 (0.7379) grad_norm: 0.3806 (0.4584) time: 5.5789 data: 0.0001 max mem: 71357 +[00:57:06.203850] Epoch: [3] [870/6500] lr: 0.000014 closs: 0.7904 (0.7391) grad_norm: 0.3834 (0.4584) time: 5.5797 data: 0.0002 max mem: 71357 +[00:58:01.896877] Epoch: [3] [880/6500] lr: 0.000014 closs: 0.7406 (0.7385) grad_norm: 0.4107 (0.4582) time: 5.5720 data: 0.0002 max mem: 71357 +[00:58:57.712138] Epoch: [3] [890/6500] lr: 0.000014 closs: 0.7278 (0.7389) grad_norm: 0.3963 (0.4577) time: 5.5753 data: 0.0001 max mem: 71357 +[00:59:53.613174] Epoch: [3] [900/6500] lr: 0.000014 closs: 0.7661 (0.7382) grad_norm: 0.4107 (0.4574) time: 5.5857 data: 0.0001 max mem: 71357 +[01:00:49.265294] Epoch: [3] [910/6500] lr: 0.000014 closs: 0.7247 (0.7374) grad_norm: 0.4342 (0.4595) time: 5.5776 data: 0.0001 max mem: 71357 +[01:01:44.937344] Epoch: [3] [920/6500] lr: 0.000013 closs: 0.6816 (0.7375) grad_norm: 0.4522 (0.4598) time: 5.5661 data: 0.0001 max mem: 71357 +[01:02:40.641403] Epoch: [3] [930/6500] lr: 0.000013 closs: 0.7064 (0.7377) grad_norm: 0.4679 (0.4599) time: 5.5687 data: 0.0002 max mem: 71357 +[01:03:36.455103] Epoch: [3] [940/6500] lr: 0.000013 closs: 0.6902 (0.7376) grad_norm: 0.4658 (0.4596) time: 5.5758 data: 0.0002 max mem: 71357 +[01:04:32.206716] Epoch: [3] [950/6500] lr: 0.000013 closs: 0.7056 (0.7373) grad_norm: 0.4290 (0.4587) time: 5.5781 data: 0.0001 max mem: 71357 +[01:05:27.962419] Epoch: [3] [960/6500] lr: 0.000013 closs: 0.7121 (0.7371) grad_norm: 0.4165 (0.4583) time: 5.5753 data: 0.0001 max mem: 71357 +[01:06:23.683254] Epoch: [3] [970/6500] lr: 0.000013 closs: 0.7357 (0.7372) grad_norm: 0.3924 (0.4586) time: 5.5737 data: 0.0001 max mem: 71357 +[01:07:19.353981] Epoch: [3] [980/6500] lr: 0.000013 closs: 0.7258 (0.7370) grad_norm: 0.3924 (0.4584) time: 5.5695 data: 0.0002 max mem: 71357 +[01:08:15.129240] Epoch: [3] [990/6500] lr: 0.000013 closs: 0.7258 (0.7366) grad_norm: 0.4066 (0.4590) time: 5.5722 data: 0.0002 max mem: 71357 +[01:09:10.838789] Epoch: [3] [1000/6500] lr: 0.000013 closs: 0.7418 (0.7373) grad_norm: 0.3735 (0.4601) time: 5.5741 data: 0.0001 max mem: 71357 +[01:10:06.563829] Epoch: [3] [1010/6500] lr: 0.000013 closs: 0.7381 (0.7374) grad_norm: 0.3834 (0.4599) time: 5.5716 data: 0.0001 max mem: 71357 +[01:11:02.258175] Epoch: [3] [1020/6500] lr: 0.000013 closs: 0.7057 (0.7374) grad_norm: 0.4172 (0.4609) time: 5.5709 data: 0.0001 max mem: 71357 +[01:11:58.103183] Epoch: [3] [1030/6500] lr: 0.000013 closs: 0.7993 (0.7380) grad_norm: 0.4857 (0.4617) time: 5.5769 data: 0.0001 max mem: 71357 +[01:12:53.782290] Epoch: [3] [1040/6500] lr: 0.000013 closs: 0.7572 (0.7380) grad_norm: 0.4870 
(0.4617) time: 5.5761 data: 0.0001 max mem: 71357 +[01:13:49.475418] Epoch: [3] [1050/6500] lr: 0.000013 closs: 0.7557 (0.7383) grad_norm: 0.4931 (0.4618) time: 5.5685 data: 0.0001 max mem: 71357 +[01:14:45.267483] Epoch: [3] [1060/6500] lr: 0.000013 closs: 0.7168 (0.7382) grad_norm: 0.4857 (0.4613) time: 5.5742 data: 0.0001 max mem: 71357 +[01:15:41.058202] Epoch: [3] [1070/6500] lr: 0.000013 closs: 0.7480 (0.7385) grad_norm: 0.4244 (0.4607) time: 5.5791 data: 0.0001 max mem: 71357 +[01:16:36.972671] Epoch: [3] [1080/6500] lr: 0.000013 closs: 0.7299 (0.7387) grad_norm: 0.3912 (0.4601) time: 5.5852 data: 0.0001 max mem: 71357 +[01:17:32.590345] Epoch: [3] [1090/6500] lr: 0.000013 closs: 0.6990 (0.7382) grad_norm: 0.3655 (0.4591) time: 5.5765 data: 0.0001 max mem: 71357 +[01:18:28.253054] Epoch: [3] [1100/6500] lr: 0.000013 closs: 0.6832 (0.7377) grad_norm: 0.3655 (0.4589) time: 5.5639 data: 0.0001 max mem: 71357 +[01:19:23.978321] Epoch: [3] [1110/6500] lr: 0.000013 closs: 0.7671 (0.7381) grad_norm: 0.3668 (0.4587) time: 5.5693 data: 0.0001 max mem: 71357 +[01:20:19.761158] Epoch: [3] [1120/6500] lr: 0.000013 closs: 0.7495 (0.7377) grad_norm: 0.3668 (0.4587) time: 5.5753 data: 0.0001 max mem: 71357 +[01:21:15.337249] Epoch: [3] [1130/6500] lr: 0.000013 closs: 0.7288 (0.7378) grad_norm: 0.4154 (0.4584) time: 5.5679 data: 0.0001 max mem: 71357 +[01:22:11.125607] Epoch: [3] [1140/6500] lr: 0.000013 closs: 0.6722 (0.7376) grad_norm: 0.3753 (0.4582) time: 5.5682 data: 0.0001 max mem: 71357 +[01:23:06.785225] Epoch: [3] [1150/6500] lr: 0.000013 closs: 0.6987 (0.7377) grad_norm: 0.3753 (0.4577) time: 5.5723 data: 0.0001 max mem: 71357 +[01:24:02.527031] Epoch: [3] [1160/6500] lr: 0.000013 closs: 0.7018 (0.7375) grad_norm: 0.4078 (0.4575) time: 5.5700 data: 0.0001 max mem: 71357 +[01:24:58.344508] Epoch: [3] [1170/6500] lr: 0.000013 closs: 0.7331 (0.7380) grad_norm: 0.3751 (0.4578) time: 5.5779 data: 0.0001 max mem: 71357 +[01:25:53.986697] Epoch: [3] [1180/6500] lr: 0.000013 closs: 0.7686 (0.7380) grad_norm: 0.4078 (0.4579) time: 5.5729 data: 0.0001 max mem: 71357 +[01:26:49.704392] Epoch: [3] [1190/6500] lr: 0.000013 closs: 0.6941 (0.7379) grad_norm: 0.4060 (0.4574) time: 5.5679 data: 0.0001 max mem: 71357 +[01:27:45.361863] Epoch: [3] [1200/6500] lr: 0.000013 closs: 0.7082 (0.7385) grad_norm: 0.4060 (0.4577) time: 5.5686 data: 0.0001 max mem: 71357 +[01:28:41.177916] Epoch: [3] [1210/6500] lr: 0.000013 closs: 0.7895 (0.7388) grad_norm: 0.4182 (0.4589) time: 5.5736 data: 0.0001 max mem: 71357 +[01:29:36.933677] Epoch: [3] [1220/6500] lr: 0.000013 closs: 0.7895 (0.7386) grad_norm: 0.4473 (0.4588) time: 5.5785 data: 0.0001 max mem: 71357 +[01:30:32.611778] Epoch: [3] [1230/6500] lr: 0.000013 closs: 0.6651 (0.7379) grad_norm: 0.4473 (0.4584) time: 5.5716 data: 0.0001 max mem: 71357 +[01:31:28.246111] Epoch: [3] [1240/6500] lr: 0.000013 closs: 0.7076 (0.7379) grad_norm: 0.4307 (0.4586) time: 5.5656 data: 0.0001 max mem: 71357 +[01:32:24.161221] Epoch: [3] [1250/6500] lr: 0.000013 closs: 0.7297 (0.7382) grad_norm: 0.4281 (0.4581) time: 5.5774 data: 0.0001 max mem: 71357 +[01:33:19.939621] Epoch: [3] [1260/6500] lr: 0.000013 closs: 0.7090 (0.7379) grad_norm: 0.3789 (0.4573) time: 5.5846 data: 0.0001 max mem: 71357 +[01:34:15.687899] Epoch: [3] [1270/6500] lr: 0.000013 closs: 0.6752 (0.7378) grad_norm: 0.3819 (0.4575) time: 5.5762 data: 0.0001 max mem: 71357 +[01:35:11.467944] Epoch: [3] [1280/6500] lr: 0.000012 closs: 0.7566 (0.7384) grad_norm: 0.3789 (0.4571) time: 5.5763 data: 0.0001 max mem: 71357 
+[01:36:07.120143] Epoch: [3] [1290/6500] lr: 0.000012 closs: 0.8008 (0.7388) grad_norm: 0.3870 (0.4576) time: 5.5715 data: 0.0001 max mem: 71357 +[01:37:02.915605] Epoch: [3] [1300/6500] lr: 0.000012 closs: 0.7662 (0.7389) grad_norm: 0.4138 (0.4575) time: 5.5723 data: 0.0001 max mem: 71357 +[01:37:58.533136] Epoch: [3] [1310/6500] lr: 0.000012 closs: 0.7344 (0.7395) grad_norm: 0.4138 (0.4575) time: 5.5706 data: 0.0001 max mem: 71357 +[01:38:54.316913] Epoch: [3] [1320/6500] lr: 0.000012 closs: 0.7344 (0.7396) grad_norm: 0.4228 (0.4576) time: 5.5700 data: 0.0001 max mem: 71357 +[01:39:50.015096] Epoch: [3] [1330/6500] lr: 0.000012 closs: 0.7196 (0.7398) grad_norm: 0.4228 (0.4582) time: 5.5740 data: 0.0001 max mem: 71357 +[01:40:45.918757] Epoch: [3] [1340/6500] lr: 0.000012 closs: 0.7234 (0.7402) grad_norm: 0.4203 (0.4577) time: 5.5800 data: 0.0001 max mem: 71357 +[01:41:41.617278] Epoch: [3] [1350/6500] lr: 0.000012 closs: 0.7311 (0.7401) grad_norm: 0.4203 (0.4579) time: 5.5800 data: 0.0001 max mem: 71357 +[01:42:37.361572] Epoch: [3] [1360/6500] lr: 0.000012 closs: 0.6839 (0.7396) grad_norm: 0.4300 (0.4578) time: 5.5721 data: 0.0001 max mem: 71357 +[01:43:33.087776] Epoch: [3] [1370/6500] lr: 0.000012 closs: 0.7108 (0.7402) grad_norm: 0.4203 (0.4580) time: 5.5734 data: 0.0001 max mem: 71357 +[01:44:28.900787] Epoch: [3] [1380/6500] lr: 0.000012 closs: 0.8245 (0.7409) grad_norm: 0.4237 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 +[01:45:24.573697] Epoch: [3] [1390/6500] lr: 0.000012 closs: 0.7725 (0.7407) grad_norm: 0.4237 (0.4583) time: 5.5742 data: 0.0001 max mem: 71357 +[01:46:20.312427] Epoch: [3] [1400/6500] lr: 0.000012 closs: 0.7474 (0.7410) grad_norm: 0.4414 (0.4586) time: 5.5705 data: 0.0001 max mem: 71357 +[01:47:16.093990] Epoch: [3] [1410/6500] lr: 0.000012 closs: 0.7474 (0.7411) grad_norm: 0.4410 (0.4586) time: 5.5759 data: 0.0001 max mem: 71357 +[01:48:11.816746] Epoch: [3] [1420/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4414 (0.4585) time: 5.5751 data: 0.0001 max mem: 71357 +[01:49:07.716266] Epoch: [3] [1430/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4410 (0.4582) time: 5.5810 data: 0.0001 max mem: 71357 +[01:50:03.388723] Epoch: [3] [1440/6500] lr: 0.000012 closs: 0.7284 (0.7415) grad_norm: 0.4376 (0.4584) time: 5.5785 data: 0.0001 max mem: 71357 +[01:50:59.024566] Epoch: [3] [1450/6500] lr: 0.000012 closs: 0.7738 (0.7418) grad_norm: 0.4376 (0.4589) time: 5.5653 data: 0.0001 max mem: 71357 +[01:51:54.723070] Epoch: [3] [1460/6500] lr: 0.000012 closs: 0.7617 (0.7418) grad_norm: 0.4371 (0.4589) time: 5.5667 data: 0.0001 max mem: 71357 +[01:52:50.538357] Epoch: [3] [1470/6500] lr: 0.000012 closs: 0.6811 (0.7416) grad_norm: 0.4182 (0.4585) time: 5.5756 data: 0.0001 max mem: 71357 +[01:53:46.361051] Epoch: [3] [1480/6500] lr: 0.000012 closs: 0.7539 (0.7417) grad_norm: 0.4128 (0.4580) time: 5.5818 data: 0.0001 max mem: 71357 +[01:54:42.096154] Epoch: [3] [1490/6500] lr: 0.000012 closs: 0.7626 (0.7418) grad_norm: 0.3807 (0.4579) time: 5.5778 data: 0.0001 max mem: 71357 +[01:55:37.782445] Epoch: [3] [1500/6500] lr: 0.000012 closs: 0.7505 (0.7420) grad_norm: 0.3785 (0.4578) time: 5.5710 data: 0.0001 max mem: 71357 +[01:56:33.437642] Epoch: [3] [1510/6500] lr: 0.000012 closs: 0.7306 (0.7415) grad_norm: 0.3785 (0.4576) time: 5.5670 data: 0.0001 max mem: 71357 +[01:57:29.352279] Epoch: [3] [1520/6500] lr: 0.000012 closs: 0.6655 (0.7410) grad_norm: 0.3785 (0.4571) time: 5.5784 data: 0.0001 max mem: 71357 +[01:58:24.994944] Epoch: [3] [1530/6500] lr: 
0.000012 closs: 0.6983 (0.7410) grad_norm: 0.4127 (0.4581) time: 5.5778 data: 0.0001 max mem: 71357 +[01:59:20.733197] Epoch: [3] [1540/6500] lr: 0.000012 closs: 0.6983 (0.7407) grad_norm: 0.4055 (0.4574) time: 5.5690 data: 0.0001 max mem: 71357 +[02:00:16.503116] Epoch: [3] [1550/6500] lr: 0.000012 closs: 0.7581 (0.7409) grad_norm: 0.3975 (0.4673) time: 5.5753 data: 0.0001 max mem: 71357 +[02:01:12.253133] Epoch: [3] [1560/6500] lr: 0.000012 closs: 0.7520 (0.7410) grad_norm: 0.4127 (0.4677) time: 5.5759 data: 0.0001 max mem: 71357 +[02:02:08.003585] Epoch: [3] [1570/6500] lr: 0.000012 closs: 0.7520 (0.7412) grad_norm: 0.3975 (0.4677) time: 5.5749 data: 0.0001 max mem: 71357 +[02:03:03.622228] Epoch: [3] [1580/6500] lr: 0.000012 closs: 0.7429 (0.7413) grad_norm: 0.4225 (0.4673) time: 5.5684 data: 0.0001 max mem: 71357 +[02:03:59.278490] Epoch: [3] [1590/6500] lr: 0.000012 closs: 0.6785 (0.7412) grad_norm: 0.4574 (0.4672) time: 5.5637 data: 0.0001 max mem: 71357 +[02:04:54.995154] Epoch: [3] [1600/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4226 (0.4683) time: 5.5686 data: 0.0001 max mem: 71357 +[02:05:50.719879] Epoch: [3] [1610/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4166 (0.4696) time: 5.5720 data: 0.0001 max mem: 71357 +[02:06:46.372165] Epoch: [3] [1620/6500] lr: 0.000012 closs: 0.7214 (0.7412) grad_norm: 0.4342 (0.4695) time: 5.5688 data: 0.0001 max mem: 71357 +[02:07:42.042038] Epoch: [3] [1630/6500] lr: 0.000012 closs: 0.7214 (0.7415) grad_norm: 0.4226 (0.4692) time: 5.5660 data: 0.0001 max mem: 71357 +[02:08:37.707675] Epoch: [3] [1640/6500] lr: 0.000012 closs: 0.6832 (0.7413) grad_norm: 0.4152 (0.4688) time: 5.5667 data: 0.0001 max mem: 71357 +[02:09:33.583277] Epoch: [3] [1650/6500] lr: 0.000012 closs: 0.7236 (0.7416) grad_norm: 0.3965 (0.4683) time: 5.5770 data: 0.0001 max mem: 71357 +[02:10:29.261319] Epoch: [3] [1660/6500] lr: 0.000012 closs: 0.7659 (0.7420) grad_norm: 0.3933 (0.4688) time: 5.5776 data: 0.0001 max mem: 71357 +[02:11:25.002648] Epoch: [3] [1670/6500] lr: 0.000011 closs: 0.7812 (0.7422) grad_norm: 0.3842 (0.4682) time: 5.5709 data: 0.0001 max mem: 71357 +[02:12:20.775138] Epoch: [3] [1680/6500] lr: 0.000011 closs: 0.7292 (0.7417) grad_norm: 0.3713 (0.4677) time: 5.5756 data: 0.0001 max mem: 71357 +[02:13:16.590435] Epoch: [3] [1690/6500] lr: 0.000011 closs: 0.7292 (0.7418) grad_norm: 0.3856 (0.4674) time: 5.5793 data: 0.0001 max mem: 71357 +[02:14:12.309570] Epoch: [3] [1700/6500] lr: 0.000011 closs: 0.7039 (0.7418) grad_norm: 0.3970 (0.4672) time: 5.5766 data: 0.0001 max mem: 71357 +[02:15:08.139498] Epoch: [3] [1710/6500] lr: 0.000011 closs: 0.6996 (0.7415) grad_norm: 0.4101 (0.4673) time: 5.5774 data: 0.0001 max mem: 71357 +[02:16:03.905752] Epoch: [3] [1720/6500] lr: 0.000011 closs: 0.7542 (0.7416) grad_norm: 0.4146 (0.4669) time: 5.5797 data: 0.0001 max mem: 71357 +[02:16:59.594667] Epoch: [3] [1730/6500] lr: 0.000011 closs: 0.7506 (0.7417) grad_norm: 0.4146 (0.4666) time: 5.5726 data: 0.0001 max mem: 71357 +[02:17:55.268246] Epoch: [3] [1740/6500] lr: 0.000011 closs: 0.6567 (0.7410) grad_norm: 0.4084 (0.4667) time: 5.5680 data: 0.0001 max mem: 71357 +[02:18:50.907554] Epoch: [3] [1750/6500] lr: 0.000011 closs: 0.6691 (0.7413) grad_norm: 0.4212 (0.4668) time: 5.5656 data: 0.0001 max mem: 71357 +[02:19:46.589279] Epoch: [3] [1760/6500] lr: 0.000011 closs: 0.6691 (0.7410) grad_norm: 0.4253 (0.4670) time: 5.5660 data: 0.0001 max mem: 71357 +[02:20:42.306504] Epoch: [3] [1770/6500] lr: 0.000011 closs: 0.6992 (0.7412) grad_norm: 0.4253 
(0.4667) time: 5.5699 data: 0.0001 max mem: 71357 +[02:21:38.162516] Epoch: [3] [1780/6500] lr: 0.000011 closs: 0.7811 (0.7412) grad_norm: 0.4221 (0.4664) time: 5.5786 data: 0.0001 max mem: 71357 +[02:22:33.796796] Epoch: [3] [1790/6500] lr: 0.000011 closs: 0.8156 (0.7417) grad_norm: 0.4038 (0.4661) time: 5.5744 data: 0.0001 max mem: 71357 +[02:23:29.492190] Epoch: [3] [1800/6500] lr: 0.000011 closs: 0.8221 (0.7420) grad_norm: 0.4221 (0.4661) time: 5.5664 data: 0.0001 max mem: 71357 +[02:24:25.122788] Epoch: [3] [1810/6500] lr: 0.000011 closs: 0.7031 (0.7419) grad_norm: 0.4221 (0.4658) time: 5.5662 data: 0.0001 max mem: 71357 +[02:25:20.865166] Epoch: [3] [1820/6500] lr: 0.000011 closs: 0.7157 (0.7420) grad_norm: 0.4443 (0.4663) time: 5.5686 data: 0.0001 max mem: 71357 +[02:26:16.547329] Epoch: [3] [1830/6500] lr: 0.000011 closs: 0.7475 (0.7419) grad_norm: 0.4443 (0.4661) time: 5.5712 data: 0.0001 max mem: 71357 +[02:27:12.190688] Epoch: [3] [1840/6500] lr: 0.000011 closs: 0.7351 (0.7418) grad_norm: 0.4383 (0.4659) time: 5.5662 data: 0.0001 max mem: 71357 +[02:28:07.854923] Epoch: [3] [1850/6500] lr: 0.000011 closs: 0.7313 (0.7419) grad_norm: 0.4383 (0.4655) time: 5.5653 data: 0.0001 max mem: 71357 +[02:29:03.539524] Epoch: [3] [1860/6500] lr: 0.000011 closs: 0.7179 (0.7419) grad_norm: 0.3883 (0.4656) time: 5.5674 data: 0.0001 max mem: 71357 +[02:29:59.225284] Epoch: [3] [1870/6500] lr: 0.000011 closs: 0.7383 (0.7420) grad_norm: 0.4725 (0.4658) time: 5.5684 data: 0.0001 max mem: 71357 +[02:30:54.926822] Epoch: [3] [1880/6500] lr: 0.000011 closs: 0.7903 (0.7422) grad_norm: 0.4160 (0.4656) time: 5.5693 data: 0.0001 max mem: 71357 +[02:31:50.596193] Epoch: [3] [1890/6500] lr: 0.000011 closs: 0.7737 (0.7417) grad_norm: 0.4482 (0.4657) time: 5.5684 data: 0.0001 max mem: 71357 +[02:32:46.225288] Epoch: [3] [1900/6500] lr: 0.000011 closs: 0.7226 (0.7418) grad_norm: 0.4482 (0.4662) time: 5.5648 data: 0.0001 max mem: 71357 +[02:33:42.077006] Epoch: [3] [1910/6500] lr: 0.000011 closs: 0.7017 (0.7414) grad_norm: 0.4312 (0.4662) time: 5.5740 data: 0.0001 max mem: 71357 +[02:34:37.702178] Epoch: [3] [1920/6500] lr: 0.000011 closs: 0.6920 (0.7413) grad_norm: 0.4146 (0.4657) time: 5.5738 data: 0.0001 max mem: 71357 +[02:35:33.312850] Epoch: [3] [1930/6500] lr: 0.000011 closs: 0.7232 (0.7414) grad_norm: 0.4084 (0.4657) time: 5.5617 data: 0.0001 max mem: 71357 +[02:36:29.008470] Epoch: [3] [1940/6500] lr: 0.000011 closs: 0.7370 (0.7416) grad_norm: 0.3967 (0.4656) time: 5.5652 data: 0.0001 max mem: 71357 +[02:37:24.578544] Epoch: [3] [1950/6500] lr: 0.000011 closs: 0.6784 (0.7410) grad_norm: 0.4146 (0.4656) time: 5.5632 data: 0.0001 max mem: 71357 +[02:38:20.386332] Epoch: [3] [1960/6500] lr: 0.000011 closs: 0.7038 (0.7412) grad_norm: 0.4360 (0.4668) time: 5.5688 data: 0.0001 max mem: 71357 +[02:39:16.102321] Epoch: [3] [1970/6500] lr: 0.000011 closs: 0.7362 (0.7417) grad_norm: 0.4360 (0.4667) time: 5.5761 data: 0.0001 max mem: 71357 +[02:40:11.878609] Epoch: [3] [1980/6500] lr: 0.000011 closs: 0.7269 (0.7417) grad_norm: 0.4407 (0.4668) time: 5.5746 data: 0.0001 max mem: 71357 +[02:41:07.583458] Epoch: [3] [1990/6500] lr: 0.000011 closs: 0.7227 (0.7414) grad_norm: 0.3940 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 +[02:42:03.433265] Epoch: [3] [2000/6500] lr: 0.000011 closs: 0.6968 (0.7418) grad_norm: 0.3859 (0.4671) time: 5.5777 data: 0.0001 max mem: 71357 +[02:42:59.197575] Epoch: [3] [2010/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3948 (0.4671) time: 5.5806 data: 0.0001 max mem: 71357 
+[02:43:54.917992] Epoch: [3] [2020/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3868 (0.4670) time: 5.5741 data: 0.0001 max mem: 71357 +[02:44:50.528619] Epoch: [3] [2030/6500] lr: 0.000011 closs: 0.7078 (0.7416) grad_norm: 0.3948 (0.4672) time: 5.5665 data: 0.0001 max mem: 71357 +[02:45:46.347257] Epoch: [3] [2040/6500] lr: 0.000011 closs: 0.7078 (0.7418) grad_norm: 0.3819 (0.4667) time: 5.5714 data: 0.0001 max mem: 71357 +[02:46:42.143262] Epoch: [3] [2050/6500] lr: 0.000011 closs: 0.8076 (0.7420) grad_norm: 0.3655 (0.4664) time: 5.5807 data: 0.0001 max mem: 71357 +[02:47:37.791827] Epoch: [3] [2060/6500] lr: 0.000011 closs: 0.7841 (0.7419) grad_norm: 0.3819 (0.4661) time: 5.5721 data: 0.0001 max mem: 71357 +[02:48:33.510323] Epoch: [3] [2070/6500] lr: 0.000010 closs: 0.7400 (0.7420) grad_norm: 0.3773 (0.4659) time: 5.5683 data: 0.0001 max mem: 71357 +[02:49:29.250770] Epoch: [3] [2080/6500] lr: 0.000010 closs: 0.7386 (0.7420) grad_norm: 0.3972 (0.4665) time: 5.5729 data: 0.0001 max mem: 71357 +[02:50:25.143434] Epoch: [3] [2090/6500] lr: 0.000010 closs: 0.7386 (0.7423) grad_norm: 0.4165 (0.4708) time: 5.5816 data: 0.0001 max mem: 71357 +[02:51:20.673668] Epoch: [3] [2100/6500] lr: 0.000010 closs: 0.7314 (0.7420) grad_norm: 0.5068 (0.4721) time: 5.5711 data: 0.0001 max mem: 71357 +[02:52:16.282560] Epoch: [3] [2110/6500] lr: 0.000010 closs: 0.7098 (0.7423) grad_norm: 0.5068 (0.4721) time: 5.5569 data: 0.0001 max mem: 71357 +[02:53:11.994560] Epoch: [3] [2120/6500] lr: 0.000010 closs: 0.7516 (0.7426) grad_norm: 0.4518 (0.4720) time: 5.5660 data: 0.0001 max mem: 71357 +[02:54:07.738349] Epoch: [3] [2130/6500] lr: 0.000010 closs: 0.7340 (0.7425) grad_norm: 0.4506 (0.4718) time: 5.5727 data: 0.0001 max mem: 71357 +[02:55:03.531949] Epoch: [3] [2140/6500] lr: 0.000010 closs: 0.6876 (0.7423) grad_norm: 0.4036 (0.4718) time: 5.5768 data: 0.0001 max mem: 71357 +[02:55:59.106827] Epoch: [3] [2150/6500] lr: 0.000010 closs: 0.6876 (0.7424) grad_norm: 0.4139 (0.4721) time: 5.5683 data: 0.0001 max mem: 71357 +[02:56:54.769431] Epoch: [3] [2160/6500] lr: 0.000010 closs: 0.7490 (0.7425) grad_norm: 0.4139 (0.4718) time: 5.5618 data: 0.0001 max mem: 71357 +[02:57:50.424404] Epoch: [3] [2170/6500] lr: 0.000010 closs: 0.6904 (0.7421) grad_norm: 0.4139 (0.4724) time: 5.5658 data: 0.0001 max mem: 71357 +[02:58:46.204630] Epoch: [3] [2180/6500] lr: 0.000010 closs: 0.7001 (0.7419) grad_norm: 0.4364 (0.4725) time: 5.5717 data: 0.0001 max mem: 71357 +[02:59:41.913462] Epoch: [3] [2190/6500] lr: 0.000010 closs: 0.7383 (0.7419) grad_norm: 0.4115 (0.4721) time: 5.5744 data: 0.0001 max mem: 71357 +[03:00:37.544665] Epoch: [3] [2200/6500] lr: 0.000010 closs: 0.7552 (0.7420) grad_norm: 0.3997 (0.4716) time: 5.5669 data: 0.0001 max mem: 71357 +[03:01:33.065049] Epoch: [3] [2210/6500] lr: 0.000010 closs: 0.7129 (0.7416) grad_norm: 0.4201 (0.4722) time: 5.5575 data: 0.0001 max mem: 71357 +[03:02:28.964229] Epoch: [3] [2220/6500] lr: 0.000010 closs: 0.7129 (0.7415) grad_norm: 0.3805 (0.4718) time: 5.5709 data: 0.0001 max mem: 71357 +[03:03:24.711773] Epoch: [3] [2230/6500] lr: 0.000010 closs: 0.7841 (0.7417) grad_norm: 0.4201 (0.4720) time: 5.5823 data: 0.0001 max mem: 71357 +[03:04:20.376127] Epoch: [3] [2240/6500] lr: 0.000010 closs: 0.7152 (0.7416) grad_norm: 0.4451 (0.4735) time: 5.5705 data: 0.0001 max mem: 71357 +[03:05:16.094408] Epoch: [3] [2250/6500] lr: 0.000010 closs: 0.7152 (0.7417) grad_norm: 0.3921 (0.4732) time: 5.5690 data: 0.0001 max mem: 71357 +[03:06:11.929158] Epoch: [3] [2260/6500] lr: 
0.000010 closs: 0.7516 (0.7419) grad_norm: 0.4235 (0.4729) time: 5.5776 data: 0.0001 max mem: 71357 +[03:07:07.681225] Epoch: [3] [2270/6500] lr: 0.000010 closs: 0.7474 (0.7419) grad_norm: 0.4103 (0.4726) time: 5.5793 data: 0.0001 max mem: 71357 +[03:08:03.330780] Epoch: [3] [2280/6500] lr: 0.000010 closs: 0.7457 (0.7421) grad_norm: 0.4028 (0.4724) time: 5.5700 data: 0.0001 max mem: 71357 +[03:08:59.036717] Epoch: [3] [2290/6500] lr: 0.000010 closs: 0.7586 (0.7420) grad_norm: 0.4277 (0.4728) time: 5.5677 data: 0.0001 max mem: 71357 +[03:09:54.753564] Epoch: [3] [2300/6500] lr: 0.000010 closs: 0.7384 (0.7419) grad_norm: 0.4449 (0.4728) time: 5.5710 data: 0.0001 max mem: 71357 +[03:10:50.585338] Epoch: [3] [2310/6500] lr: 0.000010 closs: 0.6609 (0.7415) grad_norm: 0.4297 (0.4726) time: 5.5773 data: 0.0001 max mem: 71357 +[03:11:46.300428] Epoch: [3] [2320/6500] lr: 0.000010 closs: 0.6929 (0.7417) grad_norm: 0.4388 (0.4725) time: 5.5773 data: 0.0001 max mem: 71357 +[03:12:42.024279] Epoch: [3] [2330/6500] lr: 0.000010 closs: 0.7588 (0.7419) grad_norm: 0.4257 (0.4725) time: 5.5719 data: 0.0001 max mem: 71357 +[03:13:37.667847] Epoch: [3] [2340/6500] lr: 0.000010 closs: 0.7638 (0.7420) grad_norm: 0.4236 (0.4723) time: 5.5683 data: 0.0001 max mem: 71357 +[03:14:33.402863] Epoch: [3] [2350/6500] lr: 0.000010 closs: 0.7448 (0.7419) grad_norm: 0.4299 (0.4724) time: 5.5688 data: 0.0001 max mem: 71357 +[03:15:29.150906] Epoch: [3] [2360/6500] lr: 0.000010 closs: 0.6643 (0.7415) grad_norm: 0.4257 (0.4721) time: 5.5741 data: 0.0001 max mem: 71357 +[03:16:24.888869] Epoch: [3] [2370/6500] lr: 0.000010 closs: 0.6643 (0.7413) grad_norm: 0.3967 (0.4717) time: 5.5742 data: 0.0001 max mem: 71357 +[03:17:20.515563] Epoch: [3] [2380/6500] lr: 0.000010 closs: 0.6623 (0.7412) grad_norm: 0.4207 (0.4722) time: 5.5682 data: 0.0001 max mem: 71357 +[03:18:16.227993] Epoch: [3] [2390/6500] lr: 0.000010 closs: 0.7421 (0.7414) grad_norm: 0.4128 (0.4721) time: 5.5669 data: 0.0001 max mem: 71357 +[03:19:12.133308] Epoch: [3] [2400/6500] lr: 0.000010 closs: 0.6836 (0.7410) grad_norm: 0.3865 (0.4718) time: 5.5808 data: 0.0001 max mem: 71357 +[03:20:07.896174] Epoch: [3] [2410/6500] lr: 0.000010 closs: 0.6543 (0.7409) grad_norm: 0.4128 (0.4717) time: 5.5833 data: 0.0001 max mem: 71357 +[03:21:03.510971] Epoch: [3] [2420/6500] lr: 0.000010 closs: 0.7160 (0.7409) grad_norm: 0.4059 (0.4716) time: 5.5688 data: 0.0001 max mem: 71357 +[03:21:59.207139] Epoch: [3] [2430/6500] lr: 0.000010 closs: 0.7137 (0.7409) grad_norm: 0.4059 (0.4717) time: 5.5655 data: 0.0001 max mem: 71357 +[03:22:55.037062] Epoch: [3] [2440/6500] lr: 0.000010 closs: 0.6463 (0.7407) grad_norm: 0.4322 (0.4724) time: 5.5762 data: 0.0001 max mem: 71357 +[03:23:50.643702] Epoch: [3] [2450/6500] lr: 0.000010 closs: 0.6266 (0.7404) grad_norm: 0.4504 (0.4727) time: 5.5717 data: 0.0001 max mem: 71357 +[03:24:46.427959] Epoch: [3] [2460/6500] lr: 0.000010 closs: 0.6887 (0.7405) grad_norm: 0.4398 (0.4724) time: 5.5694 data: 0.0001 max mem: 71357 +[03:25:42.142791] Epoch: [3] [2470/6500] lr: 0.000010 closs: 0.7283 (0.7406) grad_norm: 0.4504 (0.4723) time: 5.5749 data: 0.0001 max mem: 71357 +[03:26:38.004975] Epoch: [3] [2480/6500] lr: 0.000010 closs: 0.6969 (0.7404) grad_norm: 0.4335 (0.4719) time: 5.5788 data: 0.0001 max mem: 71357 +[03:27:33.744560] Epoch: [3] [2490/6500] lr: 0.000010 closs: 0.6872 (0.7404) grad_norm: 0.3993 (0.4719) time: 5.5800 data: 0.0001 max mem: 71357 +[03:28:29.410295] Epoch: [3] [2500/6500] lr: 0.000010 closs: 0.6872 (0.7403) grad_norm: 0.4124 
(0.4718) time: 5.5702 data: 0.0001 max mem: 71357 +[03:29:25.128655] Epoch: [3] [2510/6500] lr: 0.000009 closs: 0.7039 (0.7404) grad_norm: 0.3874 (0.4718) time: 5.5691 data: 0.0001 max mem: 71357 +[03:30:20.824465] Epoch: [3] [2520/6500] lr: 0.000009 closs: 0.7661 (0.7407) grad_norm: 0.3943 (0.4717) time: 5.5706 data: 0.0001 max mem: 71357 +[03:31:16.630889] Epoch: [3] [2530/6500] lr: 0.000009 closs: 0.7887 (0.7411) grad_norm: 0.3874 (0.4713) time: 5.5750 data: 0.0001 max mem: 71357 +[03:32:12.221908] Epoch: [3] [2540/6500] lr: 0.000009 closs: 0.7857 (0.7410) grad_norm: 0.3943 (0.4717) time: 5.5698 data: 0.0001 max mem: 71357 +[03:33:07.870496] Epoch: [3] [2550/6500] lr: 0.000009 closs: 0.7204 (0.7411) grad_norm: 0.4102 (0.4718) time: 5.5619 data: 0.0001 max mem: 71357 +[03:34:03.554231] Epoch: [3] [2560/6500] lr: 0.000009 closs: 0.7516 (0.7415) grad_norm: 0.4711 (0.4719) time: 5.5666 data: 0.0001 max mem: 71357 +[03:34:59.248273] Epoch: [3] [2570/6500] lr: 0.000009 closs: 0.7628 (0.7414) grad_norm: 0.4896 (0.4722) time: 5.5688 data: 0.0001 max mem: 71357 +[03:35:54.952768] Epoch: [3] [2580/6500] lr: 0.000009 closs: 0.6777 (0.7414) grad_norm: 0.4469 (0.4719) time: 5.5698 data: 0.0001 max mem: 71357 +[03:36:50.647324] Epoch: [3] [2590/6500] lr: 0.000009 closs: 0.7527 (0.7414) grad_norm: 0.4279 (0.4720) time: 5.5699 data: 0.0001 max mem: 71357 +[03:37:46.354833] Epoch: [3] [2600/6500] lr: 0.000009 closs: 0.7692 (0.7415) grad_norm: 0.3699 (0.4717) time: 5.5700 data: 0.0001 max mem: 71357 +[03:38:41.964847] Epoch: [3] [2610/6500] lr: 0.000009 closs: 0.7800 (0.7416) grad_norm: 0.3615 (0.4716) time: 5.5658 data: 0.0001 max mem: 71357 +[03:39:37.792299] Epoch: [3] [2620/6500] lr: 0.000009 closs: 0.6826 (0.7413) grad_norm: 0.4102 (0.4716) time: 5.5718 data: 0.0001 max mem: 71357 +[03:40:33.423699] Epoch: [3] [2630/6500] lr: 0.000009 closs: 0.7121 (0.7411) grad_norm: 0.3668 (0.4712) time: 5.5729 data: 0.0001 max mem: 71357 +[03:41:29.121161] Epoch: [3] [2640/6500] lr: 0.000009 closs: 0.7084 (0.7409) grad_norm: 0.4153 (0.4710) time: 5.5663 data: 0.0001 max mem: 71357 +[03:42:24.835125] Epoch: [3] [2650/6500] lr: 0.000009 closs: 0.7084 (0.7410) grad_norm: 0.3918 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 +[03:43:20.618505] Epoch: [3] [2660/6500] lr: 0.000009 closs: 0.7455 (0.7410) grad_norm: 0.3730 (0.4704) time: 5.5748 data: 0.0001 max mem: 71357 +[03:44:16.323482] Epoch: [3] [2670/6500] lr: 0.000009 closs: 0.7791 (0.7412) grad_norm: 0.3868 (0.4702) time: 5.5743 data: 0.0001 max mem: 71357 +[03:45:12.012818] Epoch: [3] [2680/6500] lr: 0.000009 closs: 0.7665 (0.7411) grad_norm: 0.3774 (0.4701) time: 5.5696 data: 0.0001 max mem: 71357 +[03:46:07.663209] Epoch: [3] [2690/6500] lr: 0.000009 closs: 0.6634 (0.7410) grad_norm: 0.4084 (0.4700) time: 5.5669 data: 0.0001 max mem: 71357 +[03:47:03.385219] Epoch: [3] [2700/6500] lr: 0.000009 closs: 0.7106 (0.7409) grad_norm: 0.4270 (0.4699) time: 5.5685 data: 0.0001 max mem: 71357 +[03:47:59.094951] Epoch: [3] [2710/6500] lr: 0.000009 closs: 0.6913 (0.7409) grad_norm: 0.4162 (0.4697) time: 5.5715 data: 0.0001 max mem: 71357 +[03:48:54.689454] Epoch: [3] [2720/6500] lr: 0.000009 closs: 0.6913 (0.7408) grad_norm: 0.4328 (0.4699) time: 5.5651 data: 0.0001 max mem: 71357 +[03:49:50.392630] Epoch: [3] [2730/6500] lr: 0.000009 closs: 0.7583 (0.7411) grad_norm: 0.4328 (0.4698) time: 5.5648 data: 0.0001 max mem: 71357 +[03:50:46.064539] Epoch: [3] [2740/6500] lr: 0.000009 closs: 0.7172 (0.7408) grad_norm: 0.4328 (0.4697) time: 5.5687 data: 0.0001 max mem: 71357 
+[03:51:41.898318] Epoch: [3] [2750/6500] lr: 0.000009 closs: 0.7137 (0.7409) grad_norm: 0.4655 (0.4696) time: 5.5752 data: 0.0001 max mem: 71357 +[03:52:37.622614] Epoch: [3] [2760/6500] lr: 0.000009 closs: 0.7675 (0.7411) grad_norm: 0.4293 (0.4695) time: 5.5778 data: 0.0001 max mem: 71357 +[03:53:33.270434] Epoch: [3] [2770/6500] lr: 0.000009 closs: 0.7621 (0.7410) grad_norm: 0.4293 (0.4694) time: 5.5685 data: 0.0001 max mem: 71357 +[03:54:28.938310] Epoch: [3] [2780/6500] lr: 0.000009 closs: 0.7231 (0.7410) grad_norm: 0.4198 (0.4695) time: 5.5657 data: 0.0001 max mem: 71357 +[03:55:24.636676] Epoch: [3] [2790/6500] lr: 0.000009 closs: 0.7205 (0.7410) grad_norm: 0.4177 (0.4694) time: 5.5683 data: 0.0001 max mem: 71357 +[03:56:20.341616] Epoch: [3] [2800/6500] lr: 0.000009 closs: 0.7450 (0.7414) grad_norm: 0.4198 (0.4702) time: 5.5701 data: 0.0001 max mem: 71357 +[03:57:16.007696] Epoch: [3] [2810/6500] lr: 0.000009 closs: 0.7234 (0.7413) grad_norm: 0.4308 (0.4709) time: 5.5685 data: 0.0001 max mem: 71357 +[03:58:11.641875] Epoch: [3] [2820/6500] lr: 0.000009 closs: 0.6968 (0.7413) grad_norm: 0.4058 (0.4706) time: 5.5649 data: 0.0001 max mem: 71357 +[03:59:07.419506] Epoch: [3] [2830/6500] lr: 0.000009 closs: 0.7425 (0.7414) grad_norm: 0.4037 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 +[04:00:03.264856] Epoch: [3] [2840/6500] lr: 0.000009 closs: 0.7550 (0.7416) grad_norm: 0.4037 (0.4703) time: 5.5811 data: 0.0001 max mem: 71357 +[04:00:58.906002] Epoch: [3] [2850/6500] lr: 0.000009 closs: 0.7550 (0.7417) grad_norm: 0.3923 (0.4703) time: 5.5743 data: 0.0001 max mem: 71357 +[04:01:54.598558] Epoch: [3] [2860/6500] lr: 0.000009 closs: 0.7611 (0.7416) grad_norm: 0.4231 (0.4702) time: 5.5666 data: 0.0001 max mem: 71357 +[04:02:50.245016] Epoch: [3] [2870/6500] lr: 0.000009 closs: 0.7677 (0.7419) grad_norm: 0.4231 (0.4703) time: 5.5669 data: 0.0001 max mem: 71357 +[04:03:45.974594] Epoch: [3] [2880/6500] lr: 0.000009 closs: 0.7250 (0.7417) grad_norm: 0.4318 (0.4703) time: 5.5687 data: 0.0001 max mem: 71357 +[04:04:41.643615] Epoch: [3] [2890/6500] lr: 0.000009 closs: 0.7242 (0.7419) grad_norm: 0.4211 (0.4705) time: 5.5698 data: 0.0001 max mem: 71357 +[04:05:37.230252] Epoch: [3] [2900/6500] lr: 0.000009 closs: 0.7474 (0.7420) grad_norm: 0.4166 (0.4704) time: 5.5627 data: 0.0001 max mem: 71357 +[04:06:32.951079] Epoch: [3] [2910/6500] lr: 0.000009 closs: 0.7044 (0.7418) grad_norm: 0.4166 (0.4703) time: 5.5653 data: 0.0001 max mem: 71357 +[04:07:28.763809] Epoch: [3] [2920/6500] lr: 0.000009 closs: 0.6616 (0.7417) grad_norm: 0.4041 (0.4698) time: 5.5766 data: 0.0001 max mem: 71357 +[04:08:24.491781] Epoch: [3] [2930/6500] lr: 0.000009 closs: 0.6237 (0.7413) grad_norm: 0.4075 (0.4698) time: 5.5770 data: 0.0001 max mem: 71357 +[04:09:20.165615] Epoch: [3] [2940/6500] lr: 0.000009 closs: 0.6831 (0.7411) grad_norm: 0.4126 (0.4697) time: 5.5700 data: 0.0001 max mem: 71357 +[04:10:15.890629] Epoch: [3] [2950/6500] lr: 0.000009 closs: 0.6921 (0.7410) grad_norm: 0.4271 (0.4699) time: 5.5699 data: 0.0001 max mem: 71357 +[04:11:11.666969] Epoch: [3] [2960/6500] lr: 0.000009 closs: 0.6946 (0.7409) grad_norm: 0.4273 (0.4696) time: 5.5750 data: 0.0001 max mem: 71357 +[04:12:07.462625] Epoch: [3] [2970/6500] lr: 0.000009 closs: 0.6784 (0.7407) grad_norm: 0.3911 (0.4695) time: 5.5785 data: 0.0001 max mem: 71357 +[04:13:03.238329] Epoch: [3] [2980/6500] lr: 0.000009 closs: 0.7087 (0.7407) grad_norm: 0.3911 (0.4693) time: 5.5785 data: 0.0001 max mem: 71357 +[04:13:58.891238] Epoch: [3] [2990/6500] lr: 
0.000009 closs: 0.7087 (0.7404) grad_norm: 0.3911 (0.4692) time: 5.5713 data: 0.0001 max mem: 71357 +[04:14:54.493479] Epoch: [3] [3000/6500] lr: 0.000008 closs: 0.6673 (0.7403) grad_norm: 0.4436 (0.4696) time: 5.5627 data: 0.0001 max mem: 71357 +[04:15:50.321151] Epoch: [3] [3010/6500] lr: 0.000008 closs: 0.7433 (0.7404) grad_norm: 0.4436 (0.4694) time: 5.5714 data: 0.0001 max mem: 71357 +[04:16:46.107372] Epoch: [3] [3020/6500] lr: 0.000008 closs: 0.7592 (0.7406) grad_norm: 0.4476 (0.4693) time: 5.5806 data: 0.0001 max mem: 71357 +[04:17:41.774052] Epoch: [3] [3030/6500] lr: 0.000008 closs: 0.7118 (0.7405) grad_norm: 0.4055 (0.4690) time: 5.5725 data: 0.0001 max mem: 71357 +[04:18:37.357139] Epoch: [3] [3040/6500] lr: 0.000008 closs: 0.7045 (0.7403) grad_norm: 0.3980 (0.4691) time: 5.5624 data: 0.0001 max mem: 71357 +[04:19:33.179800] Epoch: [3] [3050/6500] lr: 0.000008 closs: 0.7116 (0.7405) grad_norm: 0.3708 (0.4687) time: 5.5702 data: 0.0001 max mem: 71357 +[04:20:29.004476] Epoch: [3] [3060/6500] lr: 0.000008 closs: 0.7054 (0.7404) grad_norm: 0.3656 (0.4685) time: 5.5823 data: 0.0001 max mem: 71357 +[04:21:24.716825] Epoch: [3] [3070/6500] lr: 0.000008 closs: 0.7146 (0.7405) grad_norm: 0.3758 (0.4684) time: 5.5768 data: 0.0001 max mem: 71357 +[04:22:20.360022] Epoch: [3] [3080/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3667 (0.4683) time: 5.5677 data: 0.0001 max mem: 71357 +[04:23:16.035938] Epoch: [3] [3090/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3956 (0.4683) time: 5.5659 data: 0.0001 max mem: 71357 +[04:24:11.851079] Epoch: [3] [3100/6500] lr: 0.000008 closs: 0.6789 (0.7400) grad_norm: 0.4137 (0.4680) time: 5.5745 data: 0.0001 max mem: 71357 +[04:25:07.496748] Epoch: [3] [3110/6500] lr: 0.000008 closs: 0.6925 (0.7402) grad_norm: 0.3955 (0.4684) time: 5.5730 data: 0.0001 max mem: 71357 +[04:26:03.184320] Epoch: [3] [3120/6500] lr: 0.000008 closs: 0.7409 (0.7402) grad_norm: 0.3955 (0.4682) time: 5.5666 data: 0.0001 max mem: 71357 +[04:26:58.826254] Epoch: [3] [3130/6500] lr: 0.000008 closs: 0.7133 (0.7402) grad_norm: 0.4034 (0.4684) time: 5.5664 data: 0.0001 max mem: 71357 +[04:27:54.567322] Epoch: [3] [3140/6500] lr: 0.000008 closs: 0.6893 (0.7399) grad_norm: 0.4219 (0.4689) time: 5.5691 data: 0.0001 max mem: 71357 +[04:28:50.349706] Epoch: [3] [3150/6500] lr: 0.000008 closs: 0.6893 (0.7402) grad_norm: 0.4405 (0.4689) time: 5.5761 data: 0.0001 max mem: 71357 +[04:29:46.023632] Epoch: [3] [3160/6500] lr: 0.000008 closs: 0.7973 (0.7404) grad_norm: 0.4484 (0.4690) time: 5.5727 data: 0.0001 max mem: 71357 +[04:30:41.687580] Epoch: [3] [3170/6500] lr: 0.000008 closs: 0.7909 (0.7406) grad_norm: 0.4616 (0.4695) time: 5.5668 data: 0.0001 max mem: 71357 +[04:31:37.298065] Epoch: [3] [3180/6500] lr: 0.000008 closs: 0.7909 (0.7408) grad_norm: 0.4616 (0.4695) time: 5.5636 data: 0.0001 max mem: 71357 +[04:32:33.030157] Epoch: [3] [3190/6500] lr: 0.000008 closs: 0.8091 (0.7411) grad_norm: 0.4743 (0.4695) time: 5.5670 data: 0.0001 max mem: 71357 +[04:33:28.658546] Epoch: [3] [3200/6500] lr: 0.000008 closs: 0.7844 (0.7411) grad_norm: 0.4590 (0.4698) time: 5.5679 data: 0.0001 max mem: 71357 +[04:34:24.435166] Epoch: [3] [3210/6500] lr: 0.000008 closs: 0.7599 (0.7413) grad_norm: 0.4360 (0.4700) time: 5.5702 data: 0.0001 max mem: 71357 +[04:35:20.170339] Epoch: [3] [3220/6500] lr: 0.000008 closs: 0.7599 (0.7411) grad_norm: 0.4270 (0.4699) time: 5.5755 data: 0.0001 max mem: 71357 +[04:36:15.966734] Epoch: [3] [3230/6500] lr: 0.000008 closs: 0.6817 (0.7410) grad_norm: 0.4210 
(0.4698) time: 5.5765 data: 0.0001 max mem: 71357 +[04:37:11.668373] Epoch: [3] [3240/6500] lr: 0.000008 closs: 0.6817 (0.7409) grad_norm: 0.4270 (0.4698) time: 5.5748 data: 0.0001 max mem: 71357 +[04:38:07.436973] Epoch: [3] [3250/6500] lr: 0.000008 closs: 0.7496 (0.7410) grad_norm: 0.4443 (0.4702) time: 5.5734 data: 0.0001 max mem: 71357 +[04:39:03.208428] Epoch: [3] [3260/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4437 (0.4700) time: 5.5769 data: 0.0001 max mem: 71357 +[04:39:58.909951] Epoch: [3] [3270/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4473 (0.4700) time: 5.5735 data: 0.0001 max mem: 71357 +[04:40:54.726343] Epoch: [3] [3280/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4473 (0.4701) time: 5.5758 data: 0.0001 max mem: 71357 +[04:41:50.415725] Epoch: [3] [3290/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4437 (0.4699) time: 5.5752 data: 0.0001 max mem: 71357 +[04:42:46.123758] Epoch: [3] [3300/6500] lr: 0.000008 closs: 0.7253 (0.7407) grad_norm: 0.4245 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 +[04:43:41.840664] Epoch: [3] [3310/6500] lr: 0.000008 closs: 0.6994 (0.7406) grad_norm: 0.4150 (0.4696) time: 5.5712 data: 0.0001 max mem: 71357 +[04:44:37.594666] Epoch: [3] [3320/6500] lr: 0.000008 closs: 0.7145 (0.7406) grad_norm: 0.4150 (0.4695) time: 5.5735 data: 0.0001 max mem: 71357 +[04:45:33.240389] Epoch: [3] [3330/6500] lr: 0.000008 closs: 0.7095 (0.7405) grad_norm: 0.4161 (0.4696) time: 5.5699 data: 0.0001 max mem: 71357 +[04:46:28.959317] Epoch: [3] [3340/6500] lr: 0.000008 closs: 0.7035 (0.7405) grad_norm: 0.4373 (0.4700) time: 5.5681 data: 0.0001 max mem: 71357 +[04:47:24.606653] Epoch: [3] [3350/6500] lr: 0.000008 closs: 0.7035 (0.7404) grad_norm: 0.4373 (0.4700) time: 5.5682 data: 0.0001 max mem: 71357 +[04:48:20.364255] Epoch: [3] [3360/6500] lr: 0.000008 closs: 0.7093 (0.7403) grad_norm: 0.4540 (0.4699) time: 5.5702 data: 0.0001 max mem: 71357 +[04:49:16.119586] Epoch: [3] [3370/6500] lr: 0.000008 closs: 0.7780 (0.7404) grad_norm: 0.4645 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 +[04:50:11.776158] Epoch: [3] [3380/6500] lr: 0.000008 closs: 0.8124 (0.7406) grad_norm: 0.4137 (0.4697) time: 5.5705 data: 0.0001 max mem: 71357 +[04:51:07.474741] Epoch: [3] [3390/6500] lr: 0.000008 closs: 0.7139 (0.7405) grad_norm: 0.4485 (0.4700) time: 5.5677 data: 0.0001 max mem: 71357 +[04:52:03.267438] Epoch: [3] [3400/6500] lr: 0.000008 closs: 0.6892 (0.7405) grad_norm: 0.3771 (0.4697) time: 5.5745 data: 0.0001 max mem: 71357 +[04:52:59.023647] Epoch: [3] [3410/6500] lr: 0.000008 closs: 0.7276 (0.7407) grad_norm: 0.3916 (0.4701) time: 5.5774 data: 0.0001 max mem: 71357 +[04:53:54.706683] Epoch: [3] [3420/6500] lr: 0.000008 closs: 0.6954 (0.7408) grad_norm: 0.3916 (0.4698) time: 5.5719 data: 0.0001 max mem: 71357 +[04:54:50.377473] Epoch: [3] [3430/6500] lr: 0.000008 closs: 0.6899 (0.7407) grad_norm: 0.4018 (0.4699) time: 5.5676 data: 0.0001 max mem: 71357 +[04:55:46.036905] Epoch: [3] [3440/6500] lr: 0.000008 closs: 0.7141 (0.7407) grad_norm: 0.4018 (0.4698) time: 5.5664 data: 0.0001 max mem: 71357 +[04:56:41.749545] Epoch: [3] [3450/6500] lr: 0.000008 closs: 0.7220 (0.7406) grad_norm: 0.3857 (0.4696) time: 5.5685 data: 0.0001 max mem: 71357 +[04:57:37.559081] Epoch: [3] [3460/6500] lr: 0.000008 closs: 0.7657 (0.7406) grad_norm: 0.3844 (0.4695) time: 5.5760 data: 0.0001 max mem: 71357 +[04:58:33.212538] Epoch: [3] [3470/6500] lr: 0.000008 closs: 0.7399 (0.7406) grad_norm: 0.3841 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 
+[04:59:28.895834] Epoch: [3] [3480/6500] lr: 0.000008 closs: 0.7152 (0.7408) grad_norm: 0.4047 (0.4693) time: 5.5668 data: 0.0001 max mem: 71357 +[05:00:24.526761] Epoch: [3] [3490/6500] lr: 0.000008 closs: 0.7117 (0.7407) grad_norm: 0.4259 (0.4694) time: 5.5656 data: 0.0001 max mem: 71357 +[05:01:20.288024] Epoch: [3] [3500/6500] lr: 0.000008 closs: 0.7111 (0.7405) grad_norm: 0.4359 (0.4695) time: 5.5695 data: 0.0001 max mem: 71357 +[05:02:15.993095] Epoch: [3] [3510/6500] lr: 0.000008 closs: 0.6038 (0.7403) grad_norm: 0.4570 (0.4696) time: 5.5733 data: 0.0001 max mem: 71357 +[05:03:11.767699] Epoch: [3] [3520/6500] lr: 0.000008 closs: 0.6809 (0.7404) grad_norm: 0.4570 (0.4696) time: 5.5739 data: 0.0001 max mem: 71357 +[05:04:07.414649] Epoch: [3] [3530/6500] lr: 0.000008 closs: 0.7217 (0.7403) grad_norm: 0.4787 (0.4703) time: 5.5710 data: 0.0001 max mem: 71357 +[05:05:03.283087] Epoch: [3] [3540/6500] lr: 0.000008 closs: 0.7217 (0.7402) grad_norm: 0.3929 (0.4701) time: 5.5757 data: 0.0001 max mem: 71357 +[05:05:58.925727] Epoch: [3] [3550/6500] lr: 0.000007 closs: 0.7480 (0.7403) grad_norm: 0.3891 (0.4698) time: 5.5754 data: 0.0001 max mem: 71357 +[05:06:54.610701] Epoch: [3] [3560/6500] lr: 0.000007 closs: 0.7596 (0.7405) grad_norm: 0.3929 (0.4698) time: 5.5663 data: 0.0001 max mem: 71357 +[05:07:50.323204] Epoch: [3] [3570/6500] lr: 0.000007 closs: 0.7339 (0.7403) grad_norm: 0.3710 (0.4695) time: 5.5698 data: 0.0001 max mem: 71357 +[05:08:46.047153] Epoch: [3] [3580/6500] lr: 0.000007 closs: 0.6691 (0.7402) grad_norm: 0.3913 (0.4694) time: 5.5718 data: 0.0001 max mem: 71357 +[05:09:41.796100] Epoch: [3] [3590/6500] lr: 0.000007 closs: 0.7296 (0.7402) grad_norm: 0.4097 (0.4695) time: 5.5736 data: 0.0001 max mem: 71357 +[05:10:37.416644] Epoch: [3] [3600/6500] lr: 0.000007 closs: 0.7296 (0.7400) grad_norm: 0.4042 (0.4696) time: 5.5684 data: 0.0001 max mem: 71357 +[05:11:33.195633] Epoch: [3] [3610/6500] lr: 0.000007 closs: 0.6830 (0.7399) grad_norm: 0.4097 (0.4693) time: 5.5699 data: 0.0001 max mem: 71357 +[05:12:28.863592] Epoch: [3] [3620/6500] lr: 0.000007 closs: 0.7189 (0.7399) grad_norm: 0.4191 (0.4693) time: 5.5723 data: 0.0001 max mem: 71357 +[05:13:24.628791] Epoch: [3] [3630/6500] lr: 0.000007 closs: 0.6991 (0.7398) grad_norm: 0.4191 (0.4692) time: 5.5716 data: 0.0001 max mem: 71357 +[05:14:20.229395] Epoch: [3] [3640/6500] lr: 0.000007 closs: 0.7327 (0.7402) grad_norm: 0.3995 (0.4690) time: 5.5682 data: 0.0001 max mem: 71357 +[05:15:15.944815] Epoch: [3] [3650/6500] lr: 0.000007 closs: 0.7718 (0.7402) grad_norm: 0.4191 (0.4692) time: 5.5657 data: 0.0001 max mem: 71357 +[05:16:11.642121] Epoch: [3] [3660/6500] lr: 0.000007 closs: 0.7550 (0.7403) grad_norm: 0.4011 (0.4695) time: 5.5705 data: 0.0001 max mem: 71357 +[05:17:07.479650] Epoch: [3] [3670/6500] lr: 0.000007 closs: 0.7673 (0.7402) grad_norm: 0.3984 (0.4693) time: 5.5767 data: 0.0001 max mem: 71357 +[05:18:03.202417] Epoch: [3] [3680/6500] lr: 0.000007 closs: 0.7821 (0.7403) grad_norm: 0.3894 (0.4694) time: 5.5780 data: 0.0001 max mem: 71357 +[05:18:58.921587] Epoch: [3] [3690/6500] lr: 0.000007 closs: 0.7572 (0.7405) grad_norm: 0.3894 (0.4693) time: 5.5720 data: 0.0001 max mem: 71357 +[05:19:54.690134] Epoch: [3] [3700/6500] lr: 0.000007 closs: 0.7508 (0.7406) grad_norm: 0.4002 (0.4692) time: 5.5743 data: 0.0001 max mem: 71357 +[05:20:50.328799] Epoch: [3] [3710/6500] lr: 0.000007 closs: 0.6726 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5703 data: 0.0001 max mem: 71357 +[05:21:46.081446] Epoch: [3] [3720/6500] lr: 
0.000007 closs: 0.6688 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5695 data: 0.0001 max mem: 71357 +[05:22:41.792831] Epoch: [3] [3730/6500] lr: 0.000007 closs: 0.7399 (0.7405) grad_norm: 0.4459 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 +[05:23:37.391272] Epoch: [3] [3740/6500] lr: 0.000007 closs: 0.7180 (0.7404) grad_norm: 0.4610 (0.4693) time: 5.5654 data: 0.0001 max mem: 71357 +[05:24:33.095997] Epoch: [3] [3750/6500] lr: 0.000007 closs: 0.6927 (0.7403) grad_norm: 0.4757 (0.4694) time: 5.5651 data: 0.0001 max mem: 71357 +[05:25:28.837747] Epoch: [3] [3760/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4757 (0.4696) time: 5.5723 data: 0.0001 max mem: 71357 +[05:26:24.545897] Epoch: [3] [3770/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4634 (0.4695) time: 5.5724 data: 0.0001 max mem: 71357 +[05:27:20.234683] Epoch: [3] [3780/6500] lr: 0.000007 closs: 0.7184 (0.7405) grad_norm: 0.4144 (0.4693) time: 5.5698 data: 0.0001 max mem: 71357 +[05:28:15.998002] Epoch: [3] [3790/6500] lr: 0.000007 closs: 0.7094 (0.7405) grad_norm: 0.3937 (0.4692) time: 5.5725 data: 0.0001 max mem: 71357 +[05:29:11.732433] Epoch: [3] [3800/6500] lr: 0.000007 closs: 0.6876 (0.7406) grad_norm: 0.3814 (0.4693) time: 5.5748 data: 0.0001 max mem: 71357 +[05:30:07.575713] Epoch: [3] [3810/6500] lr: 0.000007 closs: 0.7399 (0.7407) grad_norm: 0.3745 (0.4689) time: 5.5788 data: 0.0001 max mem: 71357 +[05:31:03.224368] Epoch: [3] [3820/6500] lr: 0.000007 closs: 0.7399 (0.7408) grad_norm: 0.3791 (0.4691) time: 5.5745 data: 0.0001 max mem: 71357 +[05:31:58.833849] Epoch: [3] [3830/6500] lr: 0.000007 closs: 0.7631 (0.7406) grad_norm: 0.4356 (0.4691) time: 5.5628 data: 0.0001 max mem: 71357 +[05:32:54.496305] Epoch: [3] [3840/6500] lr: 0.000007 closs: 0.6846 (0.7406) grad_norm: 0.4207 (0.4695) time: 5.5635 data: 0.0001 max mem: 71357 +[05:33:50.347789] Epoch: [3] [3850/6500] lr: 0.000007 closs: 0.7558 (0.7407) grad_norm: 0.5182 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 +[05:34:46.039279] Epoch: [3] [3860/6500] lr: 0.000007 closs: 0.7782 (0.7407) grad_norm: 0.4689 (0.4698) time: 5.5771 data: 0.0001 max mem: 71357 +[05:35:41.780086] Epoch: [3] [3870/6500] lr: 0.000007 closs: 0.7601 (0.7408) grad_norm: 0.4636 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 +[05:36:37.492954] Epoch: [3] [3880/6500] lr: 0.000007 closs: 0.7601 (0.7409) grad_norm: 0.4093 (0.4704) time: 5.5726 data: 0.0001 max mem: 71357 +[05:37:33.218084] Epoch: [3] [3890/6500] lr: 0.000007 closs: 0.7292 (0.7407) grad_norm: 0.3931 (0.4702) time: 5.5718 data: 0.0001 max mem: 71357 +[05:38:29.017484] Epoch: [3] [3900/6500] lr: 0.000007 closs: 0.7349 (0.7407) grad_norm: 0.4056 (0.4702) time: 5.5761 data: 0.0001 max mem: 71357 +[05:39:24.708152] Epoch: [3] [3910/6500] lr: 0.000007 closs: 0.8072 (0.7410) grad_norm: 0.3931 (0.4700) time: 5.5744 data: 0.0001 max mem: 71357 +[05:40:20.454098] Epoch: [3] [3920/6500] lr: 0.000007 closs: 0.8075 (0.7412) grad_norm: 0.3901 (0.4698) time: 5.5717 data: 0.0001 max mem: 71357 +[05:41:16.072069] Epoch: [3] [3930/6500] lr: 0.000007 closs: 0.7397 (0.7413) grad_norm: 0.3941 (0.4698) time: 5.5681 data: 0.0001 max mem: 71357 +[05:42:11.874456] Epoch: [3] [3940/6500] lr: 0.000007 closs: 0.7319 (0.7412) grad_norm: 0.3901 (0.4697) time: 5.5710 data: 0.0001 max mem: 71357 +[05:43:07.571147] Epoch: [3] [3950/6500] lr: 0.000007 closs: 0.7194 (0.7411) grad_norm: 0.4072 (0.4699) time: 5.5749 data: 0.0001 max mem: 71357 +[05:44:03.269451] Epoch: [3] [3960/6500] lr: 0.000007 closs: 0.6988 (0.7410) grad_norm: 0.4316 
(0.4697) time: 5.5697 data: 0.0001 max mem: 71357 +[05:44:58.911292] Epoch: [3] [3970/6500] lr: 0.000007 closs: 0.6769 (0.7411) grad_norm: 0.4400 (0.4698) time: 5.5669 data: 0.0001 max mem: 71357 +[05:45:54.762157] Epoch: [3] [3980/6500] lr: 0.000007 closs: 0.6727 (0.7409) grad_norm: 0.4316 (0.4695) time: 5.5746 data: 0.0001 max mem: 71357 +[05:46:50.420700] Epoch: [3] [3990/6500] lr: 0.000007 closs: 0.7395 (0.7412) grad_norm: 0.4151 (0.4694) time: 5.5754 data: 0.0001 max mem: 71357 +[05:47:46.149461] Epoch: [3] [4000/6500] lr: 0.000007 closs: 0.7470 (0.7409) grad_norm: 0.4080 (0.4692) time: 5.5693 data: 0.0001 max mem: 71357 +[05:48:41.905023] Epoch: [3] [4010/6500] lr: 0.000007 closs: 0.6684 (0.7408) grad_norm: 0.3969 (0.4693) time: 5.5741 data: 0.0001 max mem: 71357 +[05:49:37.663723] Epoch: [3] [4020/6500] lr: 0.000007 closs: 0.7380 (0.7409) grad_norm: 0.4290 (0.4692) time: 5.5756 data: 0.0001 max mem: 71357 +[05:50:33.353764] Epoch: [3] [4030/6500] lr: 0.000007 closs: 0.7211 (0.7407) grad_norm: 0.4290 (0.4699) time: 5.5724 data: 0.0001 max mem: 71357 +[05:51:29.001387] Epoch: [3] [4040/6500] lr: 0.000007 closs: 0.7211 (0.7408) grad_norm: 0.4670 (0.4702) time: 5.5668 data: 0.0001 max mem: 71357 +[05:52:24.657192] Epoch: [3] [4050/6500] lr: 0.000007 closs: 0.7172 (0.7407) grad_norm: 0.4290 (0.4704) time: 5.5651 data: 0.0001 max mem: 71357 +[05:53:20.370617] Epoch: [3] [4060/6500] lr: 0.000007 closs: 0.6402 (0.7406) grad_norm: 0.4262 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 +[05:54:16.087743] Epoch: [3] [4070/6500] lr: 0.000007 closs: 0.6991 (0.7406) grad_norm: 0.4298 (0.4702) time: 5.5714 data: 0.0001 max mem: 71357 +[05:55:11.720085] Epoch: [3] [4080/6500] lr: 0.000007 closs: 0.7579 (0.7408) grad_norm: 0.4249 (0.4702) time: 5.5674 data: 0.0001 max mem: 71357 +[05:56:07.403518] Epoch: [3] [4090/6500] lr: 0.000007 closs: 0.7559 (0.7408) grad_norm: 0.4282 (0.4702) time: 5.5657 data: 0.0001 max mem: 71357 +[05:57:03.082301] Epoch: [3] [4100/6500] lr: 0.000007 closs: 0.6847 (0.7406) grad_norm: 0.3763 (0.4701) time: 5.5680 data: 0.0001 max mem: 71357 +[05:57:58.773781] Epoch: [3] [4110/6500] lr: 0.000007 closs: 0.6676 (0.7406) grad_norm: 0.3763 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 +[05:58:54.385862] Epoch: [3] [4120/6500] lr: 0.000007 closs: 0.7262 (0.7404) grad_norm: 0.4959 (0.4706) time: 5.5651 data: 0.0001 max mem: 71357 +[05:59:50.104040] Epoch: [3] [4130/6500] lr: 0.000007 closs: 0.7262 (0.7403) grad_norm: 0.4959 (0.4705) time: 5.5664 data: 0.0001 max mem: 71357 +[06:00:45.692846] Epoch: [3] [4140/6500] lr: 0.000007 closs: 0.7231 (0.7404) grad_norm: 0.5343 (0.4706) time: 5.5653 data: 0.0001 max mem: 71357 +[06:01:41.355915] Epoch: [3] [4150/6500] lr: 0.000007 closs: 0.7379 (0.7404) grad_norm: 0.5177 (0.4707) time: 5.5625 data: 0.0001 max mem: 71357 +[06:02:37.089968] Epoch: [3] [4160/6500] lr: 0.000007 closs: 0.7380 (0.7404) grad_norm: 0.4467 (0.4707) time: 5.5698 data: 0.0001 max mem: 71357 +[06:03:32.774291] Epoch: [3] [4170/6500] lr: 0.000007 closs: 0.7332 (0.7404) grad_norm: 0.5048 (0.4711) time: 5.5708 data: 0.0001 max mem: 71357 +[06:04:28.497632] Epoch: [3] [4180/6500] lr: 0.000007 closs: 0.7578 (0.7405) grad_norm: 0.4494 (0.4712) time: 5.5703 data: 0.0001 max mem: 71357 +[06:05:24.216613] Epoch: [3] [4190/6500] lr: 0.000007 closs: 0.7601 (0.7405) grad_norm: 0.4129 (0.4710) time: 5.5720 data: 0.0001 max mem: 71357 +[06:06:20.098227] Epoch: [3] [4200/6500] lr: 0.000007 closs: 0.7665 (0.7406) grad_norm: 0.3923 (0.4710) time: 5.5800 data: 0.0001 max mem: 71357 
+[06:07:15.828052] Epoch: [3] [4210/6500] lr: 0.000007 closs: 0.7841 (0.7408) grad_norm: 0.3622 (0.4709) time: 5.5805 data: 0.0001 max mem: 71357 +[06:08:11.532027] Epoch: [3] [4220/6500] lr: 0.000007 closs: 0.7194 (0.7406) grad_norm: 0.3691 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 +[06:09:07.164298] Epoch: [3] [4230/6500] lr: 0.000006 closs: 0.6699 (0.7404) grad_norm: 0.3812 (0.4724) time: 5.5667 data: 0.0001 max mem: 71357 +[06:10:02.992629] Epoch: [3] [4240/6500] lr: 0.000006 closs: 0.6629 (0.7404) grad_norm: 0.4220 (0.4724) time: 5.5729 data: 0.0001 max mem: 71357 +[06:10:58.745878] Epoch: [3] [4250/6500] lr: 0.000006 closs: 0.7612 (0.7405) grad_norm: 0.3945 (0.4722) time: 5.5790 data: 0.0001 max mem: 71357 +[06:11:54.440900] Epoch: [3] [4260/6500] lr: 0.000006 closs: 0.7949 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5724 data: 0.0001 max mem: 71357 +[06:12:50.182793] Epoch: [3] [4270/6500] lr: 0.000006 closs: 0.8130 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5718 data: 0.0001 max mem: 71357 +[06:13:45.770945] Epoch: [3] [4280/6500] lr: 0.000006 closs: 0.6939 (0.7405) grad_norm: 0.4629 (0.4730) time: 5.5664 data: 0.0001 max mem: 71357 +[06:14:41.714399] Epoch: [3] [4290/6500] lr: 0.000006 closs: 0.7175 (0.7407) grad_norm: 0.4635 (0.4731) time: 5.5765 data: 0.0001 max mem: 71357 +[06:15:37.371600] Epoch: [3] [4300/6500] lr: 0.000006 closs: 0.8015 (0.7408) grad_norm: 0.4426 (0.4728) time: 5.5799 data: 0.0001 max mem: 71357 +[06:16:33.108011] Epoch: [3] [4310/6500] lr: 0.000006 closs: 0.7606 (0.7409) grad_norm: 0.4413 (0.4732) time: 5.5696 data: 0.0001 max mem: 71357 +[06:17:28.770795] Epoch: [3] [4320/6500] lr: 0.000006 closs: 0.7072 (0.7408) grad_norm: 0.3953 (0.4731) time: 5.5699 data: 0.0001 max mem: 71357 +[06:18:24.617908] Epoch: [3] [4330/6500] lr: 0.000006 closs: 0.7142 (0.7409) grad_norm: 0.3763 (0.4730) time: 5.5754 data: 0.0001 max mem: 71357 +[06:19:20.284502] Epoch: [3] [4340/6500] lr: 0.000006 closs: 0.7488 (0.7408) grad_norm: 0.4262 (0.4732) time: 5.5756 data: 0.0001 max mem: 71357 +[06:20:16.015270] Epoch: [3] [4350/6500] lr: 0.000006 closs: 0.6938 (0.7408) grad_norm: 0.4168 (0.4731) time: 5.5698 data: 0.0001 max mem: 71357 +[06:21:11.725133] Epoch: [3] [4360/6500] lr: 0.000006 closs: 0.7256 (0.7410) grad_norm: 0.4232 (0.4732) time: 5.5720 data: 0.0001 max mem: 71357 +[06:22:07.350207] Epoch: [3] [4370/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4732) time: 5.5667 data: 0.0001 max mem: 71357 +[06:23:03.236264] Epoch: [3] [4380/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4735) time: 5.5755 data: 0.0001 max mem: 71357 +[06:23:58.889034] Epoch: [3] [4390/6500] lr: 0.000006 closs: 0.7240 (0.7409) grad_norm: 0.4513 (0.4736) time: 5.5769 data: 0.0001 max mem: 71357 +[06:24:54.533006] Epoch: [3] [4400/6500] lr: 0.000006 closs: 0.7240 (0.7408) grad_norm: 0.4513 (0.4737) time: 5.5647 data: 0.0001 max mem: 71357 +[06:25:50.206903] Epoch: [3] [4410/6500] lr: 0.000006 closs: 0.7493 (0.7411) grad_norm: 0.4513 (0.4737) time: 5.5658 data: 0.0001 max mem: 71357 +[06:26:45.995600] Epoch: [3] [4420/6500] lr: 0.000006 closs: 0.7180 (0.7411) grad_norm: 0.4341 (0.4776) time: 5.5730 data: 0.0001 max mem: 71357 +[06:27:41.697454] Epoch: [3] [4430/6500] lr: 0.000006 closs: 0.6501 (0.7409) grad_norm: 0.4669 (0.4775) time: 5.5745 data: 0.0001 max mem: 71357 +[06:28:37.418266] Epoch: [3] [4440/6500] lr: 0.000006 closs: 0.6607 (0.7407) grad_norm: 0.4669 (0.4775) time: 5.5711 data: 0.0001 max mem: 71357 +[06:29:33.035519] Epoch: [3] [4450/6500] lr: 
0.000006 closs: 0.6990 (0.7406) grad_norm: 0.4688 (0.4777) time: 5.5668 data: 0.0001 max mem: 71357 +[06:30:28.813181] Epoch: [3] [4460/6500] lr: 0.000006 closs: 0.7062 (0.7406) grad_norm: 0.4669 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 +[06:31:24.518637] Epoch: [3] [4470/6500] lr: 0.000006 closs: 0.8034 (0.7408) grad_norm: 0.4726 (0.4779) time: 5.5741 data: 0.0001 max mem: 71357 +[06:32:20.176534] Epoch: [3] [4480/6500] lr: 0.000006 closs: 0.7781 (0.7407) grad_norm: 0.4556 (0.4779) time: 5.5681 data: 0.0001 max mem: 71357 +[06:33:15.934576] Epoch: [3] [4490/6500] lr: 0.000006 closs: 0.7453 (0.7408) grad_norm: 0.4051 (0.4776) time: 5.5707 data: 0.0001 max mem: 71357 +[06:34:11.633237] Epoch: [3] [4500/6500] lr: 0.000006 closs: 0.7150 (0.7406) grad_norm: 0.4099 (0.4775) time: 5.5728 data: 0.0001 max mem: 71357 +[06:35:07.409947] Epoch: [3] [4510/6500] lr: 0.000006 closs: 0.6874 (0.7405) grad_norm: 0.4041 (0.4775) time: 5.5737 data: 0.0001 max mem: 71357 +[06:36:03.057657] Epoch: [3] [4520/6500] lr: 0.000006 closs: 0.7244 (0.7407) grad_norm: 0.4633 (0.4778) time: 5.5711 data: 0.0001 max mem: 71357 +[06:36:58.751863] Epoch: [3] [4530/6500] lr: 0.000006 closs: 0.7346 (0.7407) grad_norm: 0.4917 (0.4777) time: 5.5670 data: 0.0001 max mem: 71357 +[06:37:54.453452] Epoch: [3] [4540/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4633 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 +[06:38:50.277539] Epoch: [3] [4550/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4308 (0.4778) time: 5.5762 data: 0.0001 max mem: 71357 +[06:39:45.982251] Epoch: [3] [4560/6500] lr: 0.000006 closs: 0.7651 (0.7409) grad_norm: 0.4175 (0.4777) time: 5.5764 data: 0.0001 max mem: 71357 +[06:40:41.651886] Epoch: [3] [4570/6500] lr: 0.000006 closs: 0.7811 (0.7410) grad_norm: 0.3881 (0.4776) time: 5.5686 data: 0.0001 max mem: 71357 +[06:41:37.315108] Epoch: [3] [4580/6500] lr: 0.000006 closs: 0.7358 (0.7410) grad_norm: 0.3934 (0.4776) time: 5.5666 data: 0.0001 max mem: 71357 +[06:42:33.052296] Epoch: [3] [4590/6500] lr: 0.000006 closs: 0.7234 (0.7410) grad_norm: 0.3934 (0.4775) time: 5.5700 data: 0.0001 max mem: 71357 +[06:43:28.866940] Epoch: [3] [4600/6500] lr: 0.000006 closs: 0.7615 (0.7411) grad_norm: 0.4202 (0.4775) time: 5.5775 data: 0.0001 max mem: 71357 +[06:44:24.515048] Epoch: [3] [4610/6500] lr: 0.000006 closs: 0.6976 (0.7410) grad_norm: 0.4237 (0.4774) time: 5.5731 data: 0.0001 max mem: 71357 +[06:45:20.203725] Epoch: [3] [4620/6500] lr: 0.000006 closs: 0.6506 (0.7410) grad_norm: 0.4171 (0.4772) time: 5.5668 data: 0.0001 max mem: 71357 +[06:46:15.879177] Epoch: [3] [4630/6500] lr: 0.000006 closs: 0.7314 (0.7410) grad_norm: 0.4004 (0.4769) time: 5.5681 data: 0.0001 max mem: 71357 +[06:47:11.707137] Epoch: [3] [4640/6500] lr: 0.000006 closs: 0.7390 (0.7410) grad_norm: 0.3716 (0.4768) time: 5.5751 data: 0.0001 max mem: 71357 +[06:48:07.323873] Epoch: [3] [4650/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4769) time: 5.5722 data: 0.0001 max mem: 71357 +[06:49:03.064604] Epoch: [3] [4660/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4767) time: 5.5678 data: 0.0001 max mem: 71357 +[06:49:58.767683] Epoch: [3] [4670/6500] lr: 0.000006 closs: 0.7634 (0.7410) grad_norm: 0.3923 (0.4766) time: 5.5721 data: 0.0001 max mem: 71357 +[06:50:54.566372] Epoch: [3] [4680/6500] lr: 0.000006 closs: 0.7191 (0.7408) grad_norm: 0.3871 (0.4765) time: 5.5750 data: 0.0001 max mem: 71357 +[06:51:50.323458] Epoch: [3] [4690/6500] lr: 0.000006 closs: 0.7153 (0.7407) grad_norm: 0.3820 
(0.4766) time: 5.5777 data: 0.0001 max mem: 71357 +[06:52:46.085085] Epoch: [3] [4700/6500] lr: 0.000006 closs: 0.6925 (0.7407) grad_norm: 0.3504 (0.4763) time: 5.5759 data: 0.0001 max mem: 71357 +[06:53:41.820424] Epoch: [3] [4710/6500] lr: 0.000006 closs: 0.6925 (0.7406) grad_norm: 0.3788 (0.4764) time: 5.5747 data: 0.0001 max mem: 71357 +[06:54:37.430020] Epoch: [3] [4720/6500] lr: 0.000006 closs: 0.7173 (0.7405) grad_norm: 0.3988 (0.4764) time: 5.5671 data: 0.0001 max mem: 71357 +[06:55:33.170954] Epoch: [3] [4730/6500] lr: 0.000006 closs: 0.7568 (0.7407) grad_norm: 0.4271 (0.4763) time: 5.5674 data: 0.0001 max mem: 71357 +[06:56:28.817636] Epoch: [3] [4740/6500] lr: 0.000006 closs: 0.7464 (0.7408) grad_norm: 0.4350 (0.4761) time: 5.5693 data: 0.0001 max mem: 71357 +[06:57:24.442554] Epoch: [3] [4750/6500] lr: 0.000006 closs: 0.7297 (0.7408) grad_norm: 0.4253 (0.4760) time: 5.5635 data: 0.0001 max mem: 71357 +[06:58:20.075994] Epoch: [3] [4760/6500] lr: 0.000006 closs: 0.7300 (0.7408) grad_norm: 0.4253 (0.4768) time: 5.5628 data: 0.0001 max mem: 71357 +[06:59:15.879354] Epoch: [3] [4770/6500] lr: 0.000006 closs: 0.6813 (0.7408) grad_norm: 0.4014 (0.4768) time: 5.5718 data: 0.0001 max mem: 71357 +[07:00:11.589236] Epoch: [3] [4780/6500] lr: 0.000006 closs: 0.6824 (0.7409) grad_norm: 0.4178 (0.4767) time: 5.5756 data: 0.0001 max mem: 71357 +[07:01:07.310329] Epoch: [3] [4790/6500] lr: 0.000006 closs: 0.7243 (0.7409) grad_norm: 0.4163 (0.4766) time: 5.5714 data: 0.0001 max mem: 71357 +[07:02:03.036933] Epoch: [3] [4800/6500] lr: 0.000006 closs: 0.7869 (0.7410) grad_norm: 0.4163 (0.4766) time: 5.5723 data: 0.0001 max mem: 71357 +[07:02:58.635514] Epoch: [3] [4810/6500] lr: 0.000006 closs: 0.7832 (0.7410) grad_norm: 0.4575 (0.4767) time: 5.5662 data: 0.0001 max mem: 71357 +[07:03:54.439261] Epoch: [3] [4820/6500] lr: 0.000006 closs: 0.7549 (0.7410) grad_norm: 0.4702 (0.4769) time: 5.5701 data: 0.0001 max mem: 71357 +[07:04:50.068310] Epoch: [3] [4830/6500] lr: 0.000006 closs: 0.7156 (0.7410) grad_norm: 0.4575 (0.4768) time: 5.5716 data: 0.0001 max mem: 71357 +[07:05:45.752970] Epoch: [3] [4840/6500] lr: 0.000006 closs: 0.7375 (0.7409) grad_norm: 0.4076 (0.4766) time: 5.5656 data: 0.0001 max mem: 71357 +[07:06:41.332388] Epoch: [3] [4850/6500] lr: 0.000006 closs: 0.7409 (0.7410) grad_norm: 0.4074 (0.4767) time: 5.5631 data: 0.0001 max mem: 71357 +[07:07:37.187318] Epoch: [3] [4860/6500] lr: 0.000006 closs: 0.6977 (0.7409) grad_norm: 0.4074 (0.4767) time: 5.5716 data: 0.0001 max mem: 71357 +[07:08:32.915254] Epoch: [3] [4870/6500] lr: 0.000006 closs: 0.6836 (0.7408) grad_norm: 0.4279 (0.4767) time: 5.5790 data: 0.0001 max mem: 71357 +[07:09:28.532950] Epoch: [3] [4880/6500] lr: 0.000006 closs: 0.7120 (0.7409) grad_norm: 0.4074 (0.4766) time: 5.5672 data: 0.0001 max mem: 71357 +[07:10:24.255320] Epoch: [3] [4890/6500] lr: 0.000006 closs: 0.7367 (0.7407) grad_norm: 0.4095 (0.4767) time: 5.5669 data: 0.0001 max mem: 71357 +[07:11:20.031107] Epoch: [3] [4900/6500] lr: 0.000006 closs: 0.7259 (0.7407) grad_norm: 0.4011 (0.4766) time: 5.5748 data: 0.0001 max mem: 71357 +[07:12:15.812216] Epoch: [3] [4910/6500] lr: 0.000006 closs: 0.7271 (0.7408) grad_norm: 0.4011 (0.4765) time: 5.5777 data: 0.0001 max mem: 71357 +[07:13:11.520623] Epoch: [3] [4920/6500] lr: 0.000006 closs: 0.7102 (0.7408) grad_norm: 0.4011 (0.4763) time: 5.5744 data: 0.0001 max mem: 71357 +[07:14:07.160205] Epoch: [3] [4930/6500] lr: 0.000006 closs: 0.7200 (0.7408) grad_norm: 0.3937 (0.4763) time: 5.5673 data: 0.0001 max mem: 71357 
+[07:15:02.721256] Epoch: [3] [4940/6500] lr: 0.000006 closs: 0.7026 (0.7407) grad_norm: 0.4128 (0.4766) time: 5.5599 data: 0.0001 max mem: 71357 +[07:15:58.537548] Epoch: [3] [4950/6500] lr: 0.000006 closs: 0.6585 (0.7407) grad_norm: 0.4183 (0.4768) time: 5.5688 data: 0.0001 max mem: 71357 +[07:16:54.210667] Epoch: [3] [4960/6500] lr: 0.000006 closs: 0.6897 (0.7407) grad_norm: 0.4334 (0.4766) time: 5.5744 data: 0.0001 max mem: 71357 +[07:17:49.890698] Epoch: [3] [4970/6500] lr: 0.000006 closs: 0.7234 (0.7405) grad_norm: 0.4334 (0.4766) time: 5.5676 data: 0.0001 max mem: 71357 +[07:18:45.562111] Epoch: [3] [4980/6500] lr: 0.000006 closs: 0.7334 (0.7404) grad_norm: 0.3593 (0.4764) time: 5.5675 data: 0.0001 max mem: 71357 +[07:19:41.353861] Epoch: [3] [4990/6500] lr: 0.000006 closs: 0.6968 (0.7404) grad_norm: 0.3642 (0.4765) time: 5.5731 data: 0.0001 max mem: 71357 +[07:20:37.076513] Epoch: [3] [5000/6500] lr: 0.000006 closs: 0.6968 (0.7405) grad_norm: 0.4023 (0.4765) time: 5.5757 data: 0.0001 max mem: 71357 +[07:21:32.711715] Epoch: [3] [5010/6500] lr: 0.000006 closs: 0.7888 (0.7405) grad_norm: 0.4166 (0.4765) time: 5.5678 data: 0.0001 max mem: 71357 +[07:22:28.392680] Epoch: [3] [5020/6500] lr: 0.000006 closs: 0.8213 (0.7407) grad_norm: 0.4392 (0.4764) time: 5.5657 data: 0.0001 max mem: 71357 +[07:23:24.053200] Epoch: [3] [5030/6500] lr: 0.000006 closs: 0.8213 (0.7408) grad_norm: 0.4247 (0.4763) time: 5.5670 data: 0.0001 max mem: 71357 +[07:24:19.885504] Epoch: [3] [5040/6500] lr: 0.000006 closs: 0.7937 (0.7409) grad_norm: 0.4176 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 +[07:25:15.541561] Epoch: [3] [5050/6500] lr: 0.000006 closs: 0.7597 (0.7410) grad_norm: 0.4224 (0.4763) time: 5.5743 data: 0.0001 max mem: 71357 +[07:26:11.374618] Epoch: [3] [5060/6500] lr: 0.000006 closs: 0.6788 (0.7409) grad_norm: 0.4224 (0.4761) time: 5.5744 data: 0.0001 max mem: 71357 +[07:27:07.078997] Epoch: [3] [5070/6500] lr: 0.000006 closs: 0.7492 (0.7409) grad_norm: 0.4120 (0.4759) time: 5.5768 data: 0.0001 max mem: 71357 +[07:28:02.865357] Epoch: [3] [5080/6500] lr: 0.000006 closs: 0.7492 (0.7410) grad_norm: 0.4302 (0.4762) time: 5.5744 data: 0.0001 max mem: 71357 +[07:28:58.502309] Epoch: [3] [5090/6500] lr: 0.000006 closs: 0.7938 (0.7412) grad_norm: 0.4302 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 +[07:29:54.168306] Epoch: [3] [5100/6500] lr: 0.000006 closs: 0.7356 (0.7412) grad_norm: 0.4987 (0.4762) time: 5.5650 data: 0.0001 max mem: 71357 +[07:30:49.827035] Epoch: [3] [5110/6500] lr: 0.000006 closs: 0.7003 (0.7412) grad_norm: 0.4987 (0.4761) time: 5.5661 data: 0.0001 max mem: 71357 +[07:31:45.641823] Epoch: [3] [5120/6500] lr: 0.000006 closs: 0.7438 (0.7412) grad_norm: 0.4244 (0.4760) time: 5.5736 data: 0.0001 max mem: 71357 +[07:32:41.455089] Epoch: [3] [5130/6500] lr: 0.000006 closs: 0.7517 (0.7413) grad_norm: 0.4237 (0.4758) time: 5.5813 data: 0.0001 max mem: 71357 +[07:33:37.179398] Epoch: [3] [5140/6500] lr: 0.000006 closs: 0.7771 (0.7413) grad_norm: 0.4015 (0.4757) time: 5.5768 data: 0.0001 max mem: 71357 +[07:34:32.952469] Epoch: [3] [5150/6500] lr: 0.000006 closs: 0.6882 (0.7412) grad_norm: 0.3916 (0.4756) time: 5.5748 data: 0.0001 max mem: 71357 +[07:35:28.690763] Epoch: [3] [5160/6500] lr: 0.000006 closs: 0.7365 (0.7415) grad_norm: 0.4015 (0.4758) time: 5.5755 data: 0.0001 max mem: 71357 +[07:36:24.470225] Epoch: [3] [5170/6500] lr: 0.000006 closs: 0.7372 (0.7414) grad_norm: 0.4123 (0.4757) time: 5.5758 data: 0.0001 max mem: 71357 +[07:37:20.151884] Epoch: [3] [5180/6500] lr: 
0.000006 closs: 0.7399 (0.7416) grad_norm: 0.4265 (0.4759) time: 5.5730 data: 0.0001 max mem: 71357 +[07:38:15.804126] Epoch: [3] [5190/6500] lr: 0.000005 closs: 0.7578 (0.7416) grad_norm: 0.4265 (0.4757) time: 5.5666 data: 0.0001 max mem: 71357 +[07:39:11.488282] Epoch: [3] [5200/6500] lr: 0.000005 closs: 0.6937 (0.7416) grad_norm: 0.4066 (0.4756) time: 5.5667 data: 0.0001 max mem: 71357 +[07:40:07.267455] Epoch: [3] [5210/6500] lr: 0.000005 closs: 0.7611 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5731 data: 0.0001 max mem: 71357 +[07:41:03.069520] Epoch: [3] [5220/6500] lr: 0.000005 closs: 0.7594 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5790 data: 0.0001 max mem: 71357 +[07:41:58.738721] Epoch: [3] [5230/6500] lr: 0.000005 closs: 0.6946 (0.7416) grad_norm: 0.4384 (0.4756) time: 5.5735 data: 0.0001 max mem: 71357 +[07:42:54.394666] Epoch: [3] [5240/6500] lr: 0.000005 closs: 0.5834 (0.7414) grad_norm: 0.4770 (0.4757) time: 5.5662 data: 0.0001 max mem: 71357 +[07:43:50.008917] Epoch: [3] [5250/6500] lr: 0.000005 closs: 0.6269 (0.7414) grad_norm: 0.4798 (0.4757) time: 5.5634 data: 0.0001 max mem: 71357 +[07:44:45.778422] Epoch: [3] [5260/6500] lr: 0.000005 closs: 0.7557 (0.7414) grad_norm: 0.4838 (0.4758) time: 5.5691 data: 0.0001 max mem: 71357 +[07:45:41.527673] Epoch: [3] [5270/6500] lr: 0.000005 closs: 0.7557 (0.7415) grad_norm: 0.5086 (0.4760) time: 5.5759 data: 0.0001 max mem: 71357 +[07:46:37.191214] Epoch: [3] [5280/6500] lr: 0.000005 closs: 0.7615 (0.7416) grad_norm: 0.4950 (0.4759) time: 5.5706 data: 0.0001 max mem: 71357 +[07:47:32.847414] Epoch: [3] [5290/6500] lr: 0.000005 closs: 0.7668 (0.7417) grad_norm: 0.4947 (0.4758) time: 5.5659 data: 0.0001 max mem: 71357 +[07:48:28.717965] Epoch: [3] [5300/6500] lr: 0.000005 closs: 0.7729 (0.7417) grad_norm: 0.4736 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 +[07:49:24.422934] Epoch: [3] [5310/6500] lr: 0.000005 closs: 0.7493 (0.7416) grad_norm: 0.4380 (0.4759) time: 5.5787 data: 0.0001 max mem: 71357 +[07:50:20.140110] Epoch: [3] [5320/6500] lr: 0.000005 closs: 0.7367 (0.7417) grad_norm: 0.4736 (0.4760) time: 5.5710 data: 0.0001 max mem: 71357 +[07:51:15.914706] Epoch: [3] [5330/6500] lr: 0.000005 closs: 0.7469 (0.7418) grad_norm: 0.4288 (0.4758) time: 5.5745 data: 0.0001 max mem: 71357 +[07:52:11.772834] Epoch: [3] [5340/6500] lr: 0.000005 closs: 0.7469 (0.7417) grad_norm: 0.3868 (0.4758) time: 5.5816 data: 0.0001 max mem: 71357 +[07:53:07.443542] Epoch: [3] [5350/6500] lr: 0.000005 closs: 0.7176 (0.7417) grad_norm: 0.3952 (0.4757) time: 5.5763 data: 0.0001 max mem: 71357 +[07:54:03.117070] Epoch: [3] [5360/6500] lr: 0.000005 closs: 0.7500 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5671 data: 0.0001 max mem: 71357 +[07:54:58.869745] Epoch: [3] [5370/6500] lr: 0.000005 closs: 0.7227 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5712 data: 0.0001 max mem: 71357 +[07:55:54.574182] Epoch: [3] [5380/6500] lr: 0.000005 closs: 0.7051 (0.7416) grad_norm: 0.3838 (0.4755) time: 5.5728 data: 0.0001 max mem: 71357 +[07:56:50.331719] Epoch: [3] [5390/6500] lr: 0.000005 closs: 0.7780 (0.7417) grad_norm: 0.4164 (0.4756) time: 5.5730 data: 0.0001 max mem: 71357 +[07:57:45.936766] Epoch: [3] [5400/6500] lr: 0.000005 closs: 0.7780 (0.7418) grad_norm: 0.4019 (0.4754) time: 5.5681 data: 0.0001 max mem: 71357 +[07:58:41.625507] Epoch: [3] [5410/6500] lr: 0.000005 closs: 0.7760 (0.7419) grad_norm: 0.4019 (0.4754) time: 5.5646 data: 0.0001 max mem: 71357 +[07:59:37.221626] Epoch: [3] [5420/6500] lr: 0.000005 closs: 0.7735 (0.7419) grad_norm: 0.4019 
(0.4753) time: 5.5641 data: 0.0001 max mem: 71357 +[08:00:32.942836] Epoch: [3] [5430/6500] lr: 0.000005 closs: 0.7487 (0.7418) grad_norm: 0.3855 (0.4752) time: 5.5658 data: 0.0001 max mem: 71357 +[08:01:28.737895] Epoch: [3] [5440/6500] lr: 0.000005 closs: 0.7487 (0.7419) grad_norm: 0.3855 (0.4751) time: 5.5757 data: 0.0001 max mem: 71357 +[08:02:24.357380] Epoch: [3] [5450/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.3975 (0.4751) time: 5.5707 data: 0.0001 max mem: 71357 +[08:03:20.013233] Epoch: [3] [5460/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.4548 (0.4752) time: 5.5637 data: 0.0001 max mem: 71357 +[08:04:15.677277] Epoch: [3] [5470/6500] lr: 0.000005 closs: 0.8010 (0.7420) grad_norm: 0.4553 (0.4751) time: 5.5659 data: 0.0001 max mem: 71357 +[08:05:11.472975] Epoch: [3] [5480/6500] lr: 0.000005 closs: 0.7603 (0.7419) grad_norm: 0.4553 (0.4752) time: 5.5729 data: 0.0001 max mem: 71357 +[08:06:07.124938] Epoch: [3] [5490/6500] lr: 0.000005 closs: 0.7009 (0.7418) grad_norm: 0.4387 (0.4750) time: 5.5723 data: 0.0001 max mem: 71357 +[08:07:02.722427] Epoch: [3] [5500/6500] lr: 0.000005 closs: 0.7009 (0.7417) grad_norm: 0.4303 (0.4751) time: 5.5624 data: 0.0001 max mem: 71357 +[08:07:58.375710] Epoch: [3] [5510/6500] lr: 0.000005 closs: 0.6630 (0.7415) grad_norm: 0.4247 (0.4758) time: 5.5625 data: 0.0001 max mem: 71357 +[08:08:54.297860] Epoch: [3] [5520/6500] lr: 0.000005 closs: 0.7243 (0.7415) grad_norm: 0.4047 (0.4758) time: 5.5787 data: 0.0001 max mem: 71357 +[08:09:49.997283] Epoch: [3] [5530/6500] lr: 0.000005 closs: 0.7034 (0.7413) grad_norm: 0.4303 (0.4758) time: 5.5810 data: 0.0001 max mem: 71357 +[08:10:45.674960] Epoch: [3] [5540/6500] lr: 0.000005 closs: 0.7039 (0.7413) grad_norm: 0.4179 (0.4757) time: 5.5688 data: 0.0001 max mem: 71357 +[08:11:41.302248] Epoch: [3] [5550/6500] lr: 0.000005 closs: 0.7121 (0.7414) grad_norm: 0.4179 (0.4758) time: 5.5652 data: 0.0001 max mem: 71357 +[08:12:37.151238] Epoch: [3] [5560/6500] lr: 0.000005 closs: 0.6410 (0.7412) grad_norm: 0.4524 (0.4762) time: 5.5737 data: 0.0001 max mem: 71357 +[08:13:32.880058] Epoch: [3] [5570/6500] lr: 0.000005 closs: 0.6737 (0.7413) grad_norm: 0.4524 (0.4761) time: 5.5788 data: 0.0001 max mem: 71357 +[08:14:28.497505] Epoch: [3] [5580/6500] lr: 0.000005 closs: 0.7277 (0.7412) grad_norm: 0.4449 (0.4760) time: 5.5673 data: 0.0001 max mem: 71357 +[08:15:24.123502] Epoch: [3] [5590/6500] lr: 0.000005 closs: 0.7041 (0.7412) grad_norm: 0.4449 (0.4761) time: 5.5621 data: 0.0001 max mem: 71357 +[08:16:19.713976] Epoch: [3] [5600/6500] lr: 0.000005 closs: 0.7092 (0.7412) grad_norm: 0.3970 (0.4760) time: 5.5607 data: 0.0001 max mem: 71357 +[08:17:15.578612] Epoch: [3] [5610/6500] lr: 0.000005 closs: 0.7257 (0.7411) grad_norm: 0.3970 (0.4760) time: 5.5727 data: 0.0001 max mem: 71357 +[08:18:11.270830] Epoch: [3] [5620/6500] lr: 0.000005 closs: 0.7130 (0.7411) grad_norm: 0.4082 (0.4759) time: 5.5778 data: 0.0001 max mem: 71357 +[08:19:07.047039] Epoch: [3] [5630/6500] lr: 0.000005 closs: 0.6523 (0.7411) grad_norm: 0.4421 (0.4759) time: 5.5734 data: 0.0001 max mem: 71357 +[08:20:02.685791] Epoch: [3] [5640/6500] lr: 0.000005 closs: 0.6917 (0.7410) grad_norm: 0.4677 (0.4758) time: 5.5707 data: 0.0001 max mem: 71357 +[08:20:58.580036] Epoch: [3] [5650/6500] lr: 0.000005 closs: 0.7371 (0.7411) grad_norm: 0.4498 (0.4758) time: 5.5766 data: 0.0001 max mem: 71357 +[08:21:54.213166] Epoch: [3] [5660/6500] lr: 0.000005 closs: 0.7371 (0.7412) grad_norm: 0.4677 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 
+[08:22:49.950783] Epoch: [3] [5670/6500] lr: 0.000005 closs: 0.7466 (0.7413) grad_norm: 0.3905 (0.4758) time: 5.5684 data: 0.0001 max mem: 71357 +[08:23:45.705757] Epoch: [3] [5680/6500] lr: 0.000005 closs: 0.7321 (0.7412) grad_norm: 0.4498 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 +[08:24:41.256843] Epoch: [3] [5690/6500] lr: 0.000005 closs: 0.6601 (0.7411) grad_norm: 0.4623 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 +[08:25:37.016730] Epoch: [3] [5700/6500] lr: 0.000005 closs: 0.6825 (0.7410) grad_norm: 0.4525 (0.4763) time: 5.5655 data: 0.0001 max mem: 71357 +[08:26:32.785310] Epoch: [3] [5710/6500] lr: 0.000005 closs: 0.6972 (0.7410) grad_norm: 0.4525 (0.4762) time: 5.5763 data: 0.0001 max mem: 71357 +[08:27:28.440180] Epoch: [3] [5720/6500] lr: 0.000005 closs: 0.7447 (0.7411) grad_norm: 0.4266 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 +[08:28:24.134165] Epoch: [3] [5730/6500] lr: 0.000005 closs: 0.7670 (0.7412) grad_norm: 0.4178 (0.4761) time: 5.5674 data: 0.0001 max mem: 71357 +[08:29:19.997220] Epoch: [3] [5740/6500] lr: 0.000005 closs: 0.7214 (0.7410) grad_norm: 0.4178 (0.4760) time: 5.5778 data: 0.0001 max mem: 71357 +[08:30:15.733399] Epoch: [3] [5750/6500] lr: 0.000005 closs: 0.6831 (0.7409) grad_norm: 0.4367 (0.4761) time: 5.5799 data: 0.0001 max mem: 71357 +[08:31:11.458578] Epoch: [3] [5760/6500] lr: 0.000005 closs: 0.6616 (0.7408) grad_norm: 0.4367 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 +[08:32:07.031492] Epoch: [3] [5770/6500] lr: 0.000005 closs: 0.7017 (0.7408) grad_norm: 0.5017 (0.4761) time: 5.5648 data: 0.0001 max mem: 71357 +[08:33:02.681051] Epoch: [3] [5780/6500] lr: 0.000005 closs: 0.7020 (0.7408) grad_norm: 0.4604 (0.4761) time: 5.5610 data: 0.0001 max mem: 71357 +[08:33:58.421784] Epoch: [3] [5790/6500] lr: 0.000005 closs: 0.7253 (0.7408) grad_norm: 0.4554 (0.4763) time: 5.5695 data: 0.0001 max mem: 71357 +[08:34:54.273999] Epoch: [3] [5800/6500] lr: 0.000005 closs: 0.7283 (0.7408) grad_norm: 0.4415 (0.4762) time: 5.5796 data: 0.0001 max mem: 71357 +[08:35:49.986871] Epoch: [3] [5810/6500] lr: 0.000005 closs: 0.7905 (0.7410) grad_norm: 0.4331 (0.4761) time: 5.5782 data: 0.0001 max mem: 71357 +[08:36:45.654555] Epoch: [3] [5820/6500] lr: 0.000005 closs: 0.7656 (0.7409) grad_norm: 0.4289 (0.4761) time: 5.5690 data: 0.0001 max mem: 71357 +[08:37:41.471240] Epoch: [3] [5830/6500] lr: 0.000005 closs: 0.6852 (0.7408) grad_norm: 0.3691 (0.4759) time: 5.5741 data: 0.0001 max mem: 71357 +[08:38:37.093389] Epoch: [3] [5840/6500] lr: 0.000005 closs: 0.6992 (0.7408) grad_norm: 0.3979 (0.4759) time: 5.5719 data: 0.0001 max mem: 71357 +[08:39:32.812807] Epoch: [3] [5850/6500] lr: 0.000005 closs: 0.7118 (0.7407) grad_norm: 0.3971 (0.4758) time: 5.5670 data: 0.0001 max mem: 71357 +[08:40:28.562085] Epoch: [3] [5860/6500] lr: 0.000005 closs: 0.7175 (0.7408) grad_norm: 0.3971 (0.4757) time: 5.5734 data: 0.0001 max mem: 71357 +[08:41:24.343575] Epoch: [3] [5870/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4756) time: 5.5765 data: 0.0001 max mem: 71357 +[08:42:20.067075] Epoch: [3] [5880/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4757) time: 5.5752 data: 0.0001 max mem: 71357 +[08:43:15.679788] Epoch: [3] [5890/6500] lr: 0.000005 closs: 0.7645 (0.7409) grad_norm: 0.4559 (0.4758) time: 5.5667 data: 0.0001 max mem: 71357 +[08:44:11.290632] Epoch: [3] [5900/6500] lr: 0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5611 data: 0.0001 max mem: 71357 +[08:45:06.943686] Epoch: [3] [5910/6500] lr: 
0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4615 (0.4757) time: 5.5631 data: 0.0001 max mem: 71357 +[08:46:02.654843] Epoch: [3] [5920/6500] lr: 0.000005 closs: 0.7520 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5681 data: 0.0001 max mem: 71357 +[08:46:58.263460] Epoch: [3] [5930/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4759) time: 5.5659 data: 0.0001 max mem: 71357 +[08:47:53.923587] Epoch: [3] [5940/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4763) time: 5.5633 data: 0.0001 max mem: 71357 +[08:48:49.647222] Epoch: [3] [5950/6500] lr: 0.000005 closs: 0.7723 (0.7411) grad_norm: 0.4731 (0.4763) time: 5.5691 data: 0.0001 max mem: 71357 +[08:49:45.438272] Epoch: [3] [5960/6500] lr: 0.000005 closs: 0.6831 (0.7408) grad_norm: 0.4702 (0.4763) time: 5.5757 data: 0.0001 max mem: 71357 +[08:50:41.080388] Epoch: [3] [5970/6500] lr: 0.000005 closs: 0.7002 (0.7408) grad_norm: 0.4085 (0.4761) time: 5.5716 data: 0.0001 max mem: 71357 +[08:51:36.713518] Epoch: [3] [5980/6500] lr: 0.000005 closs: 0.7981 (0.7411) grad_norm: 0.4085 (0.4763) time: 5.5637 data: 0.0001 max mem: 71357 +[08:52:32.360209] Epoch: [3] [5990/6500] lr: 0.000005 closs: 0.8063 (0.7411) grad_norm: 0.3989 (0.4761) time: 5.5639 data: 0.0001 max mem: 71357 +[08:53:28.046393] Epoch: [3] [6000/6500] lr: 0.000005 closs: 0.7429 (0.7410) grad_norm: 0.4288 (0.4762) time: 5.5666 data: 0.0001 max mem: 71357 +[08:54:23.776691] Epoch: [3] [6010/6500] lr: 0.000005 closs: 0.7170 (0.7410) grad_norm: 0.4823 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 +[08:55:19.330299] Epoch: [3] [6020/6500] lr: 0.000005 closs: 0.6639 (0.7408) grad_norm: 0.4531 (0.4762) time: 5.5641 data: 0.0001 max mem: 71357 +[08:56:15.081992] Epoch: [3] [6030/6500] lr: 0.000005 closs: 0.6893 (0.7408) grad_norm: 0.4349 (0.4761) time: 5.5652 data: 0.0001 max mem: 71357 +[08:57:10.718534] Epoch: [3] [6040/6500] lr: 0.000005 closs: 0.7329 (0.7409) grad_norm: 0.3941 (0.4765) time: 5.5693 data: 0.0001 max mem: 71357 +[08:58:06.494709] Epoch: [3] [6050/6500] lr: 0.000005 closs: 0.8222 (0.7411) grad_norm: 0.3941 (0.4766) time: 5.5705 data: 0.0001 max mem: 71357 +[08:59:02.188091] Epoch: [3] [6060/6500] lr: 0.000005 closs: 0.7643 (0.7410) grad_norm: 0.3920 (0.4764) time: 5.5734 data: 0.0001 max mem: 71357 +[08:59:57.800593] Epoch: [3] [6070/6500] lr: 0.000005 closs: 0.7363 (0.7410) grad_norm: 0.4194 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 +[09:00:53.339756] Epoch: [3] [6080/6500] lr: 0.000005 closs: 0.7633 (0.7411) grad_norm: 0.4194 (0.4763) time: 5.5575 data: 0.0001 max mem: 71357 +[09:01:49.113502] Epoch: [3] [6090/6500] lr: 0.000005 closs: 0.7977 (0.7411) grad_norm: 0.4062 (0.4764) time: 5.5656 data: 0.0001 max mem: 71357 +[09:02:44.848015] Epoch: [3] [6100/6500] lr: 0.000005 closs: 0.8059 (0.7412) grad_norm: 0.4194 (0.4763) time: 5.5754 data: 0.0001 max mem: 71357 +[09:03:40.394260] Epoch: [3] [6110/6500] lr: 0.000005 closs: 0.7635 (0.7411) grad_norm: 0.4153 (0.4764) time: 5.5640 data: 0.0001 max mem: 71357 +[09:04:36.100670] Epoch: [3] [6120/6500] lr: 0.000005 closs: 0.6694 (0.7411) grad_norm: 0.3990 (0.4763) time: 5.5625 data: 0.0001 max mem: 71357 +[09:05:31.751861] Epoch: [3] [6130/6500] lr: 0.000005 closs: 0.7213 (0.7412) grad_norm: 0.4238 (0.4762) time: 5.5678 data: 0.0001 max mem: 71357 +[09:06:27.636239] Epoch: [3] [6140/6500] lr: 0.000005 closs: 0.7295 (0.7412) grad_norm: 0.4095 (0.4760) time: 5.5767 data: 0.0001 max mem: 71357 +[09:07:23.235152] Epoch: [3] [6150/6500] lr: 0.000005 closs: 0.7065 (0.7412) grad_norm: 0.4102 
(0.4761) time: 5.5741 data: 0.0001 max mem: 71357 +[09:08:18.818036] Epoch: [3] [6160/6500] lr: 0.000005 closs: 0.7877 (0.7413) grad_norm: 0.4559 (0.4763) time: 5.5590 data: 0.0001 max mem: 71357 +[09:09:14.505759] Epoch: [3] [6170/6500] lr: 0.000005 closs: 0.7101 (0.7411) grad_norm: 0.4395 (0.4762) time: 5.5634 data: 0.0001 max mem: 71357 +[09:10:10.350422] Epoch: [3] [6180/6500] lr: 0.000005 closs: 0.6311 (0.7409) grad_norm: 0.5068 (0.4763) time: 5.5765 data: 0.0001 max mem: 71357 +[09:11:06.023015] Epoch: [3] [6190/6500] lr: 0.000005 closs: 0.7146 (0.7410) grad_norm: 0.4877 (0.4762) time: 5.5758 data: 0.0001 max mem: 71357 +[09:12:01.785870] Epoch: [3] [6200/6500] lr: 0.000005 closs: 0.7406 (0.7411) grad_norm: 0.4877 (0.4763) time: 5.5717 data: 0.0001 max mem: 71357 +[09:12:57.440583] Epoch: [3] [6210/6500] lr: 0.000005 closs: 0.7689 (0.7412) grad_norm: 0.4298 (0.4761) time: 5.5708 data: 0.0001 max mem: 71357 +[09:13:53.257089] Epoch: [3] [6220/6500] lr: 0.000005 closs: 0.7652 (0.7412) grad_norm: 0.4117 (0.4761) time: 5.5735 data: 0.0001 max mem: 71357 +[09:14:48.956788] Epoch: [3] [6230/6500] lr: 0.000005 closs: 0.7270 (0.7411) grad_norm: 0.4298 (0.4761) time: 5.5757 data: 0.0001 max mem: 71357 +[09:15:44.591134] Epoch: [3] [6240/6500] lr: 0.000005 closs: 0.7631 (0.7412) grad_norm: 0.4232 (0.4763) time: 5.5666 data: 0.0001 max mem: 71357 +[09:16:40.191312] Epoch: [3] [6250/6500] lr: 0.000005 closs: 0.8537 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5617 data: 0.0001 max mem: 71357 +[09:17:35.838897] Epoch: [3] [6260/6500] lr: 0.000005 closs: 0.8001 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 +[09:18:31.640816] Epoch: [3] [6270/6500] lr: 0.000005 closs: 0.7339 (0.7415) grad_norm: 0.4230 (0.4763) time: 5.5724 data: 0.0001 max mem: 71357 +[09:19:27.206966] Epoch: [3] [6280/6500] lr: 0.000005 closs: 0.7339 (0.7414) grad_norm: 0.4230 (0.4763) time: 5.5683 data: 0.0001 max mem: 71357 +[09:20:22.913908] Epoch: [3] [6290/6500] lr: 0.000005 closs: 0.7190 (0.7414) grad_norm: 0.4122 (0.4762) time: 5.5636 data: 0.0001 max mem: 71357 +[09:21:18.497214] Epoch: [3] [6300/6500] lr: 0.000005 closs: 0.7719 (0.7415) grad_norm: 0.4122 (0.4762) time: 5.5644 data: 0.0001 max mem: 71357 +[09:22:14.290866] Epoch: [3] [6310/6500] lr: 0.000005 closs: 0.7057 (0.7415) grad_norm: 0.4122 (0.4761) time: 5.5688 data: 0.0001 max mem: 71357 +[09:23:09.931856] Epoch: [3] [6320/6500] lr: 0.000005 closs: 0.6686 (0.7414) grad_norm: 0.4850 (0.4765) time: 5.5717 data: 0.0001 max mem: 71357 +[09:24:05.612801] Epoch: [3] [6330/6500] lr: 0.000005 closs: 0.6958 (0.7414) grad_norm: 0.4899 (0.4765) time: 5.5660 data: 0.0001 max mem: 71357 +[09:25:01.244089] Epoch: [3] [6340/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4416 (0.4764) time: 5.5655 data: 0.0001 max mem: 71357 +[09:25:56.860737] Epoch: [3] [6350/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4298 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 +[09:26:52.670083] Epoch: [3] [6360/6500] lr: 0.000005 closs: 0.7802 (0.7416) grad_norm: 0.4094 (0.4762) time: 5.5712 data: 0.0001 max mem: 71357 +[09:27:48.219140] Epoch: [3] [6370/6500] lr: 0.000005 closs: 0.7762 (0.7416) grad_norm: 0.4253 (0.4763) time: 5.5678 data: 0.0001 max mem: 71357 +[09:28:43.931513] Epoch: [3] [6380/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4253 (0.4762) time: 5.5630 data: 0.0001 max mem: 71357 +[09:29:39.533688] Epoch: [3] [6390/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4533 (0.4762) time: 5.5656 data: 0.0001 max mem: 71357 
+[09:30:35.257343] Epoch: [3] [6400/6500] lr: 0.000005 closs: 0.7405 (0.7417) grad_norm: 0.4680 (0.4762) time: 5.5662 data: 0.0001 max mem: 71357 +[09:31:30.993736] Epoch: [3] [6410/6500] lr: 0.000005 closs: 0.7159 (0.7417) grad_norm: 0.4596 (0.4761) time: 5.5729 data: 0.0001 max mem: 71357 +[09:32:26.749437] Epoch: [3] [6420/6500] lr: 0.000005 closs: 0.7601 (0.7419) grad_norm: 0.4596 (0.4761) time: 5.5745 data: 0.0001 max mem: 71357 +[09:33:22.454703] Epoch: [3] [6430/6500] lr: 0.000005 closs: 0.7906 (0.7420) grad_norm: 0.4588 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 +[09:34:18.165306] Epoch: [3] [6440/6500] lr: 0.000005 closs: 0.7978 (0.7421) grad_norm: 0.4588 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 +[09:35:13.814875] Epoch: [3] [6450/6500] lr: 0.000005 closs: 0.8280 (0.7422) grad_norm: 0.4473 (0.4763) time: 5.5679 data: 0.0001 max mem: 71357 +[09:36:09.491251] Epoch: [3] [6460/6500] lr: 0.000005 closs: 0.7346 (0.7422) grad_norm: 0.4588 (0.4768) time: 5.5662 data: 0.0001 max mem: 71357 +[09:37:05.187478] Epoch: [3] [6470/6500] lr: 0.000005 closs: 0.7346 (0.7421) grad_norm: 0.4473 (0.4767) time: 5.5685 data: 0.0001 max mem: 71357 +[09:38:00.919941] Epoch: [3] [6480/6500] lr: 0.000005 closs: 0.6989 (0.7421) grad_norm: 0.4163 (0.4766) time: 5.5713 data: 0.0001 max mem: 71357 +[09:38:56.716451] Epoch: [3] [6490/6500] lr: 0.000005 closs: 0.7505 (0.7422) grad_norm: 0.4275 (0.4767) time: 5.5763 data: 0.0001 max mem: 71357 +[09:39:47.308149] Epoch: [3] Total time: 10:03:39 +[09:39:47.344265] Averaged stats: lr: 0.000005 closs: 0.7505 (0.7420) grad_norm: 0.4643 (0.4769) +[09:39:47.504858] model saved +[09:39:48.501829] optimizer saved +[09:39:48.502389] other rank-common saved +[09:39:48.505739] rank-specific saved +[09:39:48.505904] Training time 1 day, 16:16:17 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..b38695565af72457e9f05c9e4ca22d0b989e9a8c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313f5e81fea27d147cde97e00126a7f8601b23525257ebf7458c92eecf0cc970 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4696f71df544953bf208fff77e98e6459d6f009 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c2c3bf6d2f02c6c58b38aea73b6f05069f5cb244a15495d6d0fb38005daee7 +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f1c0fc1c5869d93ad63691f8b563ddfc94a491e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bf124baa9c1335af5aea4800b2ecc4751c23fe1e00b1039e65300f3a8a604df +size 1687 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..56aae264f8bb29bb559083b58986da1e9fb59b05 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cc391cf566e6ffa902c1bce14c8e49dfd776d4a477e9db68ddb38586324df74 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c80900041c5cb481f436a046d4228a25f84f81c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c008837893b85ed8ec5956d6325baad2bbb37f2ff9292343f070f45c5a214e +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aedb034e6703bb173a5809ba3841b4aec77b84f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c2800e6ca30ebd241d0f9e1eb878c8cefbe4b64abe9193d161a46e70ab4a7f +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc4d43c9d981f1347aacc56e7915534f880e916e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15fc9177852922041cbd21983bf3035d7a369572d47f5d3c59a95a96157645a6 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..e25b2de607363f557126bbe17c77377ff503e06d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba98b1d4711d2593bda512d80ff5dde0f7b0f59279b684d03bb9d176d451d23c +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dee9756db1f953b5909c16e290ec296695d713e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:345734640b889b71c244859184f43ef9df7cdc2bb5b54df1b067c3eb5926cad0 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..c41978c5774d772f60e84e9353d71223f4fe4cb1 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e603f33e4d2bb53c22e63aec2ab119a4eb5fa43229f5378fd59d317b09db95e +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..001cee0a11609de544c4911f87ef6c06adfff2e6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796eb49d927deacfcebdb9e6a7f7410c3c039a8d122dfb050a99bb689f20f1aa +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..3eb817d0ed9c35f2a5b4064f75035754d00a8e8a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80dc16b24ea013668c3f644439b10bf3c06724b5ab5982e34f615f5742cbffc9 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..abb77c76056fe3af1b2bf4deef0701efd4b3bf43 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.5515950994171532, "train_closs": 0.9724007532174878, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.5515950994171532, "val_closs": 0.9724007532174878} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.4230820783253374, "train_closs": 0.9194491385009709, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.4230820783253374, "val_closs": 0.9194491385009709} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.43375423697415244, "train_closs": 0.9097992562197993, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.43375423697415244, "val_closs": 0.9097992562197993} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.4319171491881897, "train_closs": 0.9059112436445476, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.4319171491881897, "val_closs": 0.9059112436445476} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2d61df6ec6a44103476fd8b35eaf94407dd1c73e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log @@ -0,0 +1,591 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 3): env://, gpu 3 +[01:06:34.716609] > initializing model parallel with size 1 +[01:06:34.716724] > initializing ddp with size 8 +[01:06:34.716732] > initializing pipeline with size 1 +[01:06:34.874304] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[01:06:34.874416] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-7b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-7b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-7b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[01:06:34.875336] Start initialization. +[01:06:34.875369] ## Processing on RANK 0. +[01:06:34.885417] Model Args: + ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[01:07:21.595514] Model is Peft: True +[01:07:21.598774] Trainable parameter count : 1626112 (local rank), 1626112 (all). +[01:07:21.606589] ## Load pretrained from ../checkpoints/llama2/Llama-2-7b/ +[01:07:36.364995] ## Quantizing model to 4bit! 
+ Qunatization Process: 0%| | 0/391 [00:00<?, ?it/s] +[01:22:05.499269] Start training for 4 epochs +[01:22:05.503408] log_dir: ./output_dir +[01:22:09.419465] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.5851 (1.5851) closs: 1.1100 (1.1100) time: 3.9149 data: 1.4914 max mem: 11698 +[01:22:20.379352] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.6298 (1.5954) closs: 1.1139 (1.1233) time: 1.3522 data: 0.1358 max mem: 17563 +[01:22:31.537117] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.5263 (1.5275) closs: 1.0717 (1.1034) time: 1.1058 data: 0.0002 max mem: 17563 +[01:22:42.269605] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.5396 (1.5436) closs: 1.1138 (1.1221) time: 1.0944 data: 0.0002 max mem: 17563 +[01:22:53.398607] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.5046 (1.5073) closs: 1.1306 (1.1303) time: 1.0930 data: 0.0002 max mem: 17563 +[01:23:04.182090] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.3218 (1.4667) closs: 1.1306 (1.1331) time: 1.0956 data: 0.0002 max mem: 17563 +[01:23:14.952358] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.2240 (1.4199) closs: 1.1418 (1.1299) time: 1.0776 data: 0.0002 max mem: 17563 +[01:23:25.936660] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.1201 (1.3708) closs: 1.0912 (1.1343) time: 1.0877 data: 0.0002 max mem: 17563 +[01:23:36.708068] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 1.0076 (1.3227) closs: 1.0998 (1.1334) time: 1.0877 data: 0.0002 max mem: 17563 +[01:23:47.591464] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.9119 (1.2713) closs: 1.0616 (1.1231) time: 1.0827 data: 0.0002 max mem: 17563 +[01:23:58.446457] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7848 (1.2187) closs: 1.0525 (1.1199) time: 1.0868 data: 0.0002 max mem: 17563 +[01:24:09.227248] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.6812 (1.1715) closs: 1.0394 (1.1090) time: 1.0817 data: 0.0002 max mem: 17563 +[01:24:20.398593] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.6384 (1.1269) closs: 1.0384 (1.1100) time: 1.0975 data: 0.0002 max mem: 17563 +[01:24:31.192852] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.6113 (1.0871) closs: 1.0841 (1.1071) time: 1.0982 data: 0.0002 max mem: 17563 +[01:24:41.965595] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.5842 (1.0530) closs: 0.9775 (1.0988) time: 1.0783 data: 0.0002 max mem: 17563 +[01:24:52.911946] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.6050 (1.0261) closs: 0.9885 (1.0948) time: 1.0859 data: 0.0002 max mem: 17563 +[01:25:03.677533] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.6036 (1.0009) closs: 1.0495 (1.0920) time: 1.0855 data: 0.0002 max mem: 17563 +[01:25:14.583376] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.6036 (0.9806) closs: 0.9975 (1.0849) time: 1.0835 data: 0.0002 max mem: 17563 +[01:25:25.336144] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.5856 (0.9554) closs: 0.9262 (1.0760) time: 1.0829 data: 0.0002 max mem: 17563 +[01:25:36.082435] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.4878 (0.9305) closs: 0.9326 (1.0696) time: 1.0749 data: 0.0002 max mem: 17563 +[01:25:47.041738] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.4558 (0.9080) closs: 0.9832 (1.0663) time: 1.0852 data: 0.0002 max mem: 17563 +[01:25:57.816030] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.4558 (0.8868) closs: 0.9928 (1.0618) time: 1.0866 data: 0.0002 max mem: 17563 +[01:26:08.707981] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.4294 (0.8654) closs: 0.9756 (1.0584) time: 1.0832 data: 0.0002 max mem: 17563 +[01:26:19.569916] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.4231 (0.8473) closs: 0.9284 (1.0518) time: 1.0876 data:
0.0002 max mem: 17563 +[01:26:30.341740] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.4198 (0.8302) closs: 0.9145 (1.0479) time: 1.0816 data: 0.0002 max mem: 17563 +[01:26:41.347965] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.4445 (0.8150) closs: 0.9541 (1.0449) time: 1.0888 data: 0.0002 max mem: 17563 +[01:26:52.141904] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.3947 (0.7985) closs: 0.9527 (1.0411) time: 1.0899 data: 0.0002 max mem: 17563 +[01:27:02.920099] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.3933 (0.7847) closs: 0.9685 (1.0398) time: 1.0785 data: 0.0002 max mem: 17563 +[01:27:13.865000] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.3972 (0.7720) closs: 0.9685 (1.0365) time: 1.0861 data: 0.0002 max mem: 17563 +[01:27:24.627033] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.3972 (0.7602) closs: 0.9590 (1.0331) time: 1.0853 data: 0.0002 max mem: 17563 +[01:27:35.600123] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.4177 (0.7492) closs: 0.9565 (1.0304) time: 1.0867 data: 0.0002 max mem: 17563 +[01:27:46.357189] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.4166 (0.7387) closs: 0.9678 (1.0295) time: 1.0864 data: 0.0002 max mem: 17563 +[01:27:57.128434] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.4347 (0.7308) closs: 0.9678 (1.0290) time: 1.0763 data: 0.0002 max mem: 17563 +[01:28:08.112145] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.4021 (0.7216) closs: 0.9325 (1.0248) time: 1.0877 data: 0.0002 max mem: 17563 +[01:28:18.879864] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.4021 (0.7125) closs: 0.9178 (1.0215) time: 1.0875 data: 0.0002 max mem: 17563 +[01:28:29.776770] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.3938 (0.7043) closs: 0.9432 (1.0223) time: 1.0832 data: 0.0002 max mem: 17563 +[01:28:40.599074] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.4132 (0.6974) closs: 0.9773 (1.0202) time: 1.0859 data: 0.0002 max mem: 17563 +[01:28:51.342835] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.4203 (0.6895) closs: 0.9131 (1.0175) time: 1.0782 data: 0.0002 max mem: 17563 +[01:29:02.284783] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.3938 (0.6827) closs: 0.9131 (1.0160) time: 1.0842 data: 0.0002 max mem: 17563 +[01:29:13.036230] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.4150 (0.6784) closs: 0.9211 (1.0143) time: 1.0846 data: 0.0002 max mem: 17563 +[01:29:23.775505] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.4409 (0.6724) closs: 0.9129 (1.0127) time: 1.0745 data: 0.0002 max mem: 17563 +[01:29:34.715185] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.4361 (0.6671) closs: 0.9129 (1.0112) time: 1.0839 data: 0.0002 max mem: 17563 +[01:29:45.495129] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.4243 (0.6609) closs: 0.8944 (1.0089) time: 1.0859 data: 0.0002 max mem: 17563 +[01:29:56.420098] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.4060 (0.6558) closs: 0.8753 (1.0060) time: 1.0852 data: 0.0002 max mem: 17563 +[01:30:07.187456] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.4138 (0.6505) closs: 0.8546 (1.0032) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:17.943263] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.4181 (0.6457) closs: 0.8741 (1.0003) time: 1.0761 data: 0.0002 max mem: 17563 +[01:30:28.879556] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.4111 (0.6407) closs: 0.9011 (0.9989) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:39.634616] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.4111 (0.6368) closs: 0.9094 (0.9983) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:50.485964] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.4260 
(0.6326) closs: 0.9646 (0.9977) time: 1.0803 data: 0.0002 max mem: 17563 +[01:31:01.334825] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.4045 (0.6274) closs: 0.9550 (0.9968) time: 1.0849 data: 0.0002 max mem: 17563 +[01:31:12.096791] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.3751 (0.6228) closs: 0.9518 (0.9954) time: 1.0805 data: 0.0002 max mem: 17563 +[01:31:23.034985] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.4087 (0.6203) closs: 0.9159 (0.9931) time: 1.0849 data: 0.0002 max mem: 17563 +[01:31:33.781764] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.4179 (0.6162) closs: 0.8666 (0.9919) time: 1.0842 data: 0.0002 max mem: 17563 +[01:31:44.525309] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.4241 (0.6133) closs: 0.9274 (0.9909) time: 1.0744 data: 0.0002 max mem: 17563 +[01:31:55.482190] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.4228 (0.6100) closs: 0.9681 (0.9901) time: 1.0849 data: 0.0002 max mem: 17563 +[01:32:06.249000] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.4076 (0.6068) closs: 0.9330 (0.9886) time: 1.0861 data: 0.0002 max mem: 17563 +[01:32:17.186588] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.4216 (0.6043) closs: 0.8816 (0.9879) time: 1.0851 data: 0.0002 max mem: 17563 +[01:32:27.934998] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.4354 (0.6016) closs: 0.9062 (0.9869) time: 1.0842 data: 0.0002 max mem: 17563 +[01:32:38.690637] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.4128 (0.5987) closs: 0.9111 (0.9867) time: 1.0751 data: 0.0002 max mem: 17563 +[01:32:49.647407] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.4128 (0.5956) closs: 0.9117 (0.9857) time: 1.0855 data: 0.0002 max mem: 17563 +[01:33:00.403073] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.4093 (0.5925) closs: 0.9308 (0.9848) time: 1.0856 data: 0.0002 max mem: 17563 +[01:33:11.264501] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.4305 (0.5901) closs: 0.8741 (0.9826) time: 1.0808 data: 0.0003 max mem: 17563 +[01:33:22.096122] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.4412 (0.5876) closs: 0.8519 (0.9818) time: 1.0846 data: 0.0003 max mem: 17563 +[01:33:32.848945] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.4384 (0.5857) closs: 0.9008 (0.9806) time: 1.0792 data: 0.0002 max mem: 17563 +[01:33:43.792068] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.4455 (0.5836) closs: 0.9237 (0.9800) time: 1.0847 data: 0.0002 max mem: 17563 +[01:33:54.556290] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.4246 (0.5814) closs: 0.9106 (0.9786) time: 1.0853 data: 0.0002 max mem: 17563 +[01:34:05.298115] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.4145 (0.5796) closs: 0.8797 (0.9780) time: 1.0752 data: 0.0002 max mem: 17563 +[01:34:16.274488] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.3915 (0.5766) closs: 0.8957 (0.9773) time: 1.0858 data: 0.0002 max mem: 17563 +[01:34:27.009081] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.3898 (0.5742) closs: 0.9229 (0.9768) time: 1.0855 data: 0.0002 max mem: 17563 +[01:34:37.956230] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.4105 (0.5721) closs: 0.9277 (0.9763) time: 1.0840 data: 0.0002 max mem: 17563 +[01:34:48.711827] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.4030 (0.5699) closs: 0.9277 (0.9760) time: 1.0851 data: 0.0002 max mem: 17563 +[01:34:59.483778] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.3941 (0.5679) closs: 0.8992 (0.9747) time: 1.0763 data: 0.0002 max mem: 17563 +[01:35:10.426566] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.4368 (0.5662) closs: 0.8889 (0.9741) time: 1.0857 data: 0.0002 max mem: 17563 +[01:35:21.173421] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.4336 (0.5642) closs: 0.9001 (0.9732) time: 1.0844 data: 0.0002 max mem: 17563 +[01:35:32.063229] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.4336 (0.5623) closs: 0.8885 (0.9718) time: 1.0818 data: 0.0002 max mem: 17563 +[01:35:42.886626] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.4241 (0.5608) closs: 0.9257 (0.9726) time: 1.0856 data: 0.0002 max mem: 17563 +[01:35:53.664183] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.4208 (0.5596) closs: 0.9473 (0.9717) time: 1.0800 data: 0.0002 max mem: 17563 +[01:36:04.654574] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.4265 (0.5578) closs: 0.9173 (0.9707) time: 1.0883 data: 0.0002 max mem: 17563 +[01:36:15.394778] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.4275 (0.5565) closs: 0.9173 (0.9703) time: 1.0865 data: 0.0002 max mem: 17563 +[01:36:26.176020] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.4331 (0.5550) closs: 0.8900 (0.9693) time: 1.0760 data: 0.0002 max mem: 17563 +[01:36:37.110017] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.4156 (0.5532) closs: 0.8900 (0.9687) time: 1.0857 data: 0.0002 max mem: 17563 +[01:36:47.887354] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.3976 (0.5518) closs: 0.8777 (0.9673) time: 1.0855 data: 0.0002 max mem: 17563 +[01:36:49.163189] Epoch: [0] Total time: 0:14:43 +[01:36:49.176568] Averaged stats: lr: 0.000050 grad_norm: 0.3924 (0.5516) closs: 0.8777 (0.9724) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[01:36:49.242844] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[01:36:49.850475] optimizer saved +[01:36:49.850895] other rank-common saved +[01:36:49.852471] rank-specific saved +[01:36:49.856696] log_dir: ./output_dir +[01:36:52.189948] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.4764 (0.4764) closs: 0.7142 (0.7142) time: 2.3324 data: 1.2396 max mem: 17563 +[01:37:03.067048] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.4408 (0.4522) closs: 0.8549 (0.8482) time: 1.2008 data: 0.1129 max mem: 17563 +[01:37:13.868785] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.4408 (0.4571) closs: 0.8781 (0.9186) time: 1.0839 data: 0.0002 max mem: 17563 +[01:37:24.639698] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.4399 (0.4536) closs: 0.9031 (0.9141) time: 1.0786 data: 0.0002 max mem: 17563 +[01:37:35.491675] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.4093 (0.4483) closs: 0.9425 (0.9326) time: 1.0811 data: 0.0002 max mem: 17563 +[01:37:46.229510] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.3888 (0.4407) closs: 0.9432 (0.9284) time: 1.0794 data: 0.0002 max mem: 17563 +[01:37:56.982923] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.3976 (0.4343) closs: 0.9081 (0.9261) time: 1.0745 data: 0.0002 max mem: 17563 +[01:38:07.884887] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.3992 (0.4349) closs: 0.9186 (0.9288) time: 1.0827 data: 0.0002 max mem: 17563 +[01:38:18.645889] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.4137 (0.4339) closs: 0.9349 (0.9248) time: 1.0831 data: 0.0002 max mem: 17563 +[01:38:29.481375] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.3875 (0.4270) closs: 0.9349 (0.9295) time: 1.0797 data: 0.0002 max mem: 17563 +[01:38:40.338975] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.3933 (0.4267) closs: 0.9616 (0.9318) time: 1.0846 data: 0.0002 max mem: 17563 +[01:38:51.268582] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.4030 (0.4267) closs: 0.9432 (0.9282) time: 1.0893 data: 0.0002 max mem: 17563 +[01:39:02.289923] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.4101 (0.4278) closs: 0.9432 (0.9294) time: 1.0975 data: 0.0002 max mem: 17563 +[01:39:13.138448] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.4101 (0.4280) closs: 0.9284 (0.9292) time: 1.0934 data: 0.0002 max mem: 17563 +[01:39:24.044720] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.4075 (0.4287) closs: 0.9221 (0.9280) time: 1.0877 data: 0.0002 max mem: 17563 +[01:39:34.896000] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.4166 (0.4289) closs: 0.9403 (0.9278) time: 1.0878 data: 0.0002 max mem: 17563 +[01:39:45.869857] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.4266 (0.4286) closs: 0.9551 (0.9301) time: 1.0912 data: 0.0002 max mem: 17563 +[01:39:56.708956] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.4038 (0.4293) 
closs: 0.9698 (0.9322) time: 1.0906 data: 0.0002 max mem: 17563 +[01:40:07.568446] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.3849 (0.4277) closs: 0.9374 (0.9326) time: 1.0849 data: 0.0003 max mem: 17563 +[01:40:18.500307] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.3905 (0.4259) closs: 0.8962 (0.9308) time: 1.0895 data: 0.0003 max mem: 17563 +[01:40:29.409889] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.3924 (0.4243) closs: 0.9011 (0.9293) time: 1.0920 data: 0.0002 max mem: 17563 +[01:40:40.336146] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.3986 (0.4233) closs: 0.9540 (0.9315) time: 1.0917 data: 0.0002 max mem: 17563 +[01:40:51.171698] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.3885 (0.4233) closs: 0.9565 (0.9301) time: 1.0880 data: 0.0002 max mem: 17563 +[01:41:02.091692] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.3872 (0.4221) closs: 0.8715 (0.9287) time: 1.0877 data: 0.0002 max mem: 17563 +[01:41:13.045652] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.3892 (0.4213) closs: 0.9001 (0.9298) time: 1.0936 data: 0.0002 max mem: 17563 +[01:41:24.014771] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.4036 (0.4215) closs: 0.9496 (0.9289) time: 1.0961 data: 0.0002 max mem: 17563 +[01:41:34.910457] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.4114 (0.4219) closs: 0.9459 (0.9293) time: 1.0932 data: 0.0002 max mem: 17563 +[01:41:45.860800] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.4219 (0.4221) closs: 0.9364 (0.9278) time: 1.0922 data: 0.0002 max mem: 17563 +[01:41:56.732678] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.4219 (0.4224) closs: 0.9236 (0.9279) time: 1.0910 data: 0.0002 max mem: 17563 +[01:42:07.667184] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.4094 (0.4220) closs: 0.9319 (0.9285) time: 1.0902 data: 0.0002 max mem: 17563 +[01:42:18.499974] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.4094 (0.4215) closs: 0.9029 (0.9271) time: 1.0883 data: 0.0002 max mem: 17563 +[01:42:29.390912] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.4156 (0.4225) closs: 0.8847 (0.9262) time: 1.0861 data: 0.0002 max mem: 17563 +[01:42:40.312982] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.4155 (0.4227) closs: 0.8956 (0.9267) time: 1.0906 data: 0.0002 max mem: 17563 +[01:42:51.239384] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.3998 (0.4223) closs: 0.8956 (0.9259) time: 1.0923 data: 0.0002 max mem: 17563 +[01:43:02.180532] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.4095 (0.4231) closs: 0.8600 (0.9239) time: 1.0933 data: 0.0002 max mem: 17563 +[01:43:13.068593] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.4290 (0.4233) closs: 0.8778 (0.9234) time: 1.0914 data: 0.0002 max mem: 17563 +[01:43:23.907152] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.4217 (0.4236) closs: 0.8525 (0.9212) time: 1.0863 data: 0.0002 max mem: 17563 +[01:43:34.848401] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.4361 (0.4246) closs: 0.8350 (0.9202) time: 1.0889 data: 0.0002 max mem: 17563 +[01:43:45.784776] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.4361 (0.4240) closs: 0.8984 (0.9204) time: 1.0938 data: 0.0002 max mem: 17563 +[01:43:56.644693] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.4039 (0.4239) closs: 0.9187 (0.9209) time: 1.0897 data: 0.0002 max mem: 17563 +[01:44:07.528074] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.4019 (0.4235) closs: 0.9184 (0.9206) time: 1.0871 data: 0.0002 max mem: 17563 +[01:44:18.390371] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.4019 (0.4239) closs: 0.9184 (0.9220) time: 1.0872 data: 0.0002 max mem: 17563 +[01:44:29.324907] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.4286 (0.4237) closs: 0.9624 (0.9224) time: 1.0898 data: 0.0002 max mem: 17563 +[01:44:40.135621] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.4213 (0.4255) closs: 0.9003 (0.9216) time: 1.0872 data: 0.0002 max mem: 17563 +[01:44:50.992615] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.4268 (0.4257) closs: 0.9141 (0.9217) time: 1.0833 data: 0.0002 max mem: 17563 +[01:45:01.952874] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.4268 (0.4260) closs: 0.9244 (0.9213) time: 1.0908 data: 0.0002 max mem: 17563 +[01:45:12.875704] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.4164 (0.4255) closs: 0.8638 (0.9207) time: 1.0941 data: 0.0002 max mem: 17563 +[01:45:23.810878] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.4022 (0.4252) closs: 0.8908 (0.9207) time: 1.0928 data: 0.0002 max mem: 17563 +[01:45:34.685211] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.4066 (0.4260) closs: 0.9035 (0.9213) time: 1.0904 data: 0.0002 max mem: 17563 +[01:45:45.553666] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.4311 (0.4269) closs: 0.9485 (0.9221) time: 1.0871 data: 0.0002 max mem: 17563 +[01:45:56.494991] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.4354 (0.4271) closs: 0.9109 (0.9218) time: 1.0904 data: 0.0002 max mem: 17563 +[01:46:07.415542] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.3986 (0.4267) closs: 0.8969 (0.9214) time: 1.0930 data: 0.0002 max mem: 17563 +[01:46:18.280334] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.4179 (0.4272) closs: 0.9120 (0.9214) time: 1.0892 data: 0.0002 max mem: 17563 +[01:46:29.242265] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.4093 (0.4266) closs: 0.9045 (0.9215) time: 1.0913 data: 0.0002 max mem: 17563 +[01:46:40.097372] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.4093 (0.4266) closs: 0.8894 (0.9216) time: 1.0908 data: 0.0002 max mem: 17563 +[01:46:51.082535] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.3950 (0.4261) closs: 0.8833 (0.9210) time: 1.0919 data: 0.0002 max mem: 17563 +[01:47:01.956871] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.3946 (0.4259) closs: 0.9061 (0.9212) time: 1.0929 data: 0.0002 max mem: 17563 +[01:47:12.829557] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.4047 (0.4257) closs: 0.9278 (0.9216) time: 1.0873 data: 0.0002 max mem: 17563 +[01:47:23.771261] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.4115 (0.4255) closs: 0.9212 (0.9207) time: 1.0906 data: 0.0002 max mem: 17563 +[01:47:34.689636] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.4115 (0.4260) closs: 0.9243 (0.9210) time: 1.0929 data: 0.0002 max mem: 17563 +[01:47:45.757877] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.3990 (0.4256) closs: 0.9293 (0.9212) time: 1.0993 data: 0.0002 max mem: 17563 +[01:47:56.585342] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.3955 (0.4253) closs: 0.9080 (0.9213) time: 1.0947 data: 0.0002 max mem: 17563 +[01:48:07.414290] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.4127 (0.4249) closs: 0.9134 (0.9214) time: 1.0828 data: 0.0002 max mem: 17563 +[01:48:18.313386] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.3864 (0.4243) closs: 0.9365 (0.9217) time: 1.0863 data: 0.0002 max mem: 17563 +[01:48:29.236580] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.3864 (0.4241) closs: 0.9075 (0.9217) time: 1.0910 data: 0.0002 max mem: 17563 +[01:48:40.090161] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.3873 (0.4239) closs: 0.8929 (0.9215) time: 1.0888 data: 0.0002 max mem: 17563 +[01:48:51.000883] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.3823 (0.4236) closs: 0.9560 (0.9225) time: 1.0881 data: 0.0002 
max mem: 17563 +[01:49:01.838795] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.3860 (0.4235) closs: 0.9866 (0.9232) time: 1.0874 data: 0.0002 max mem: 17563 +[01:49:12.727690] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.3983 (0.4238) closs: 0.9759 (0.9238) time: 1.0863 data: 0.0002 max mem: 17563 +[01:49:23.555512] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.3988 (0.4241) closs: 0.9759 (0.9243) time: 1.0858 data: 0.0002 max mem: 17563 +[01:49:34.420950] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.4054 (0.4239) closs: 0.9389 (0.9243) time: 1.0846 data: 0.0002 max mem: 17563 +[01:49:45.387096] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.4068 (0.4237) closs: 0.8852 (0.9239) time: 1.0915 data: 0.0002 max mem: 17563 +[01:49:56.367505] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.4067 (0.4235) closs: 0.8749 (0.9235) time: 1.0972 data: 0.0004 max mem: 17563 +[01:50:07.313072] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.4023 (0.4233) closs: 0.9136 (0.9235) time: 1.0962 data: 0.0003 max mem: 17563 +[01:50:18.143294] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.4113 (0.4235) closs: 0.9239 (0.9234) time: 1.0887 data: 0.0002 max mem: 17563 +[01:50:28.998868] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.4113 (0.4232) closs: 0.8806 (0.9231) time: 1.0842 data: 0.0002 max mem: 17563 +[01:50:39.933717] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.3932 (0.4227) closs: 0.8906 (0.9235) time: 1.0895 data: 0.0002 max mem: 17563 +[01:50:50.842088] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.3787 (0.4221) closs: 0.9384 (0.9230) time: 1.0921 data: 0.0002 max mem: 17563 +[01:51:01.700960] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.3964 (0.4226) closs: 0.8583 (0.9227) time: 1.0883 data: 0.0002 max mem: 17563 +[01:51:12.543296] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.4172 (0.4225) closs: 0.9070 (0.9229) time: 1.0850 data: 0.0001 max mem: 17563 +[01:51:23.433881] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.4172 (0.4227) closs: 0.9150 (0.9225) time: 1.0866 data: 0.0001 max mem: 17563 +[01:51:34.266174] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.4238 (0.4232) closs: 0.9150 (0.9225) time: 1.0861 data: 0.0001 max mem: 17563 +[01:51:35.625120] Epoch: [1] Total time: 0:14:45 +[01:51:35.641527] Averaged stats: lr: 0.000039 grad_norm: 0.4238 (0.4231) closs: 0.9164 (0.9194) +[01:51:35.700824] model saved +[01:51:36.391314] optimizer saved +[01:51:36.391767] other rank-common saved +[01:51:36.393341] rank-specific saved +[01:51:36.397531] log_dir: ./output_dir +[01:51:38.765850] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.4308 (0.4308) closs: 0.8720 (0.8720) time: 2.3674 data: 1.2568 max mem: 17563 +[01:51:49.643979] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.4273 (0.4116) closs: 0.8873 (0.9060) time: 1.2041 data: 0.1144 max mem: 17563 +[01:52:00.424263] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.4013 (0.4131) closs: 0.8921 (0.8999) time: 1.0829 data: 0.0002 max mem: 17563 +[01:52:11.196775] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.4013 (0.4098) closs: 0.8966 (0.9069) time: 1.0776 data: 0.0002 max mem: 17563 +[01:52:22.046734] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.4087 (0.4220) closs: 0.9032 (0.9065) time: 1.0811 data: 0.0002 max mem: 17563 +[01:52:32.833783] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.4287 (0.4248) closs: 0.9085 (0.9040) time: 1.0818 data: 0.0002 max mem: 17563 +[01:52:43.693301] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.4213 (0.4241) closs: 0.9320 (0.9076) time: 1.0823 data: 0.0002 max mem: 17563 +[01:52:54.454634] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.4030 (0.4262) closs: 0.9246 (0.9125) time: 1.0810 data: 0.0002 max mem: 17563 +[01:53:05.308252] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.4046 (0.4238) closs: 0.9040 (0.9099) time: 1.0807 data: 0.0002 max mem: 17563 +[01:53:16.131876] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.4062 (0.4232) closs: 0.8718 (0.9071) time: 1.0838 data: 0.0002 max mem: 17563 +[01:53:27.101904] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.4088 (0.4233) closs: 0.8718 (0.9085) time: 1.0896 data: 0.0002 max mem: 17563 +[01:53:37.951020] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.4034 (0.4227) closs: 0.8835 (0.9034) time: 1.0909 data: 0.0002 max mem: 17563 +[01:53:48.791842] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.3949 (0.4206) closs: 0.8952 (0.9047) time: 1.0844 data: 0.0002 max mem: 17563 +[01:53:59.632784] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.3865 (0.4186) closs: 0.8820 (0.9033) time: 1.0840 data: 0.0002 max mem: 17563 +[01:54:10.462481] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.3865 (0.4176) closs: 0.8681 (0.9026) time: 1.0835 data: 0.0002 max mem: 17563 +[01:54:21.299678] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.3883 (0.4179) closs: 0.8821 (0.9037) time: 1.0833 data: 0.0002 max mem: 17563 +[01:54:32.133168] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.3883 (0.4174) closs: 0.9362 (0.9048) time: 1.0835 data: 0.0002 max mem: 17563 +[01:54:43.049102] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.3878 (0.4167) closs: 0.8898 (0.9036) time: 1.0874 data: 0.0002 max mem: 17563 +[01:54:53.915941] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.4010 (0.4158) closs: 0.8534 (0.9029) time: 1.0891 data: 0.0002 max mem: 17563 +[01:55:04.769991] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.4093 (0.4189) closs: 0.9012 (0.9044) time: 1.0860 data: 0.0002 max mem: 17563 +[01:55:15.615925] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.4076 (0.4203) closs: 0.9158 (0.9061) time: 1.0849 data: 0.0002 max mem: 17563 +[01:55:26.497841] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.4065 (0.4216) closs: 0.9073 (0.9066) time: 1.0863 data: 0.0002 max mem: 17563 +[01:55:37.343158] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.4200 (0.4232) closs: 0.9341 (0.9079) time: 1.0863 data: 0.0002 max mem: 17563 +[01:55:48.278472] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.4200 (0.4235) closs: 0.9294 (0.9084) time: 1.0890 data: 0.0002 max mem: 17563 +[01:55:59.129091] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.4072 (0.4236) closs: 0.8965 (0.9083) time: 1.0892 data: 0.0002 max mem: 17563 +[01:56:10.001149] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.3972 (0.4229) closs: 0.8913 (0.9088) time: 1.0861 data: 0.0002 max mem: 17563 +[01:56:20.856739] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.4202 (0.4234) closs: 0.8602 (0.9085) time: 1.0863 data: 0.0002 max mem: 17563 +[01:56:31.687201] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.4245 (0.4227) closs: 0.8610 (0.9098) time: 1.0842 data: 0.0002 max mem: 17563 +[01:56:42.523986] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.4059 (0.4224) closs: 0.9101 (0.9114) time: 1.0833 data: 0.0002 max mem: 17563 +[01:56:53.373471] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.4080 (0.4223) closs: 0.8769 (0.9081) time: 1.0842 data: 0.0002 max mem: 17563 +[01:57:04.298846] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.4092 (0.4218) closs: 0.8855 (0.9079) time: 1.0887 data: 0.0002 max mem: 17563 +[01:57:15.135896] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.4092 (0.4234) closs: 0.9208 (0.9080) time: 1.0880 data: 0.0002 max mem: 17563 
+[01:57:25.967555] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.3988 (0.4233) closs: 0.9238 (0.9095) time: 1.0834 data: 0.0002 max mem: 17563 +[01:57:36.782449] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.4307 (0.4249) closs: 0.9413 (0.9110) time: 1.0823 data: 0.0002 max mem: 17563 +[01:57:47.653331] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.4349 (0.4252) closs: 0.9600 (0.9121) time: 1.0842 data: 0.0002 max mem: 17563 +[01:57:58.502839] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.4147 (0.4250) closs: 0.9340 (0.9119) time: 1.0859 data: 0.0002 max mem: 17563 +[01:58:09.451401] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.4077 (0.4246) closs: 0.9024 (0.9128) time: 1.0898 data: 0.0002 max mem: 17563 +[01:58:20.271015] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.4077 (0.4243) closs: 0.9013 (0.9127) time: 1.0883 data: 0.0002 max mem: 17563 +[01:58:31.122124] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.4193 (0.4244) closs: 0.9082 (0.9120) time: 1.0835 data: 0.0002 max mem: 17563 +[01:58:41.965781] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.4238 (0.4249) closs: 0.9083 (0.9105) time: 1.0847 data: 0.0002 max mem: 17563 +[01:58:52.791949] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.4226 (0.4252) closs: 0.8808 (0.9103) time: 1.0834 data: 0.0002 max mem: 17563 +[01:59:03.649565] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.4266 (0.4251) closs: 0.8843 (0.9101) time: 1.0841 data: 0.0002 max mem: 17563 +[01:59:14.539085] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.4235 (0.4253) closs: 0.8703 (0.9101) time: 1.0873 data: 0.0002 max mem: 17563 +[01:59:25.436742] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.4027 (0.4252) closs: 0.8717 (0.9107) time: 1.0893 data: 0.0002 max mem: 17563 +[01:59:36.307301] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.4181 (0.4259) closs: 0.9164 (0.9105) time: 1.0883 data: 0.0002 max mem: 17563 +[01:59:47.106471] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.4380 (0.4258) closs: 0.9270 (0.9118) time: 1.0834 data: 0.0002 max mem: 17563 +[01:59:57.969495] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.4109 (0.4264) closs: 0.9670 (0.9138) time: 1.0830 data: 0.0002 max mem: 17563 +[02:00:08.857151] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.4159 (0.4406) closs: 0.9521 (0.9133) time: 1.0875 data: 0.0002 max mem: 17563 +[02:00:19.720928] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.4153 (0.4403) closs: 0.9305 (0.9144) time: 1.0875 data: 0.0002 max mem: 17563 +[02:00:30.663962] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.3940 (0.4397) closs: 0.9178 (0.9137) time: 1.0903 data: 0.0002 max mem: 17563 +[02:00:41.489372] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.3940 (0.4394) closs: 0.8950 (0.9133) time: 1.0883 data: 0.0002 max mem: 17563 +[02:00:52.333520] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.4045 (0.4389) closs: 0.8950 (0.9133) time: 1.0834 data: 0.0002 max mem: 17563 +[02:01:03.175443] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.4021 (0.4383) closs: 0.8764 (0.9133) time: 1.0842 data: 0.0002 max mem: 17563 +[02:01:14.038903] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.4054 (0.4382) closs: 0.9012 (0.9132) time: 1.0852 data: 0.0002 max mem: 17563 +[02:01:24.892673] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.4148 (0.4375) closs: 0.8972 (0.9131) time: 1.0858 data: 0.0002 max mem: 17563 +[02:01:35.779226] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.3733 (0.4364) closs: 0.8972 (0.9130) time: 1.0869 data: 0.0002 max mem: 17563 +[02:01:46.730439] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.3960 (0.4364) closs: 0.8940 
(0.9132) time: 1.0918 data: 0.0002 max mem: 17563 +[02:01:57.589054] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.4338 (0.4367) closs: 0.8873 (0.9125) time: 1.0904 data: 0.0002 max mem: 17563 +[02:02:08.438216] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.4207 (0.4365) closs: 0.8640 (0.9114) time: 1.0853 data: 0.0002 max mem: 17563 +[02:02:19.300511] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.4164 (0.4366) closs: 0.8945 (0.9105) time: 1.0855 data: 0.0002 max mem: 17563 +[02:02:30.163210] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.4080 (0.4360) closs: 0.8987 (0.9106) time: 1.0862 data: 0.0002 max mem: 17563 +[02:02:40.965411] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.4122 (0.4359) closs: 0.9033 (0.9105) time: 1.0832 data: 0.0002 max mem: 17563 +[02:02:51.875860] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.4149 (0.4355) closs: 0.9033 (0.9103) time: 1.0856 data: 0.0002 max mem: 17563 +[02:03:02.716853] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.3996 (0.4353) closs: 0.8798 (0.9102) time: 1.0875 data: 0.0002 max mem: 17563 +[02:03:13.555206] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.4151 (0.4353) closs: 0.8893 (0.9102) time: 1.0839 data: 0.0002 max mem: 17563 +[02:03:24.404228] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.4151 (0.4359) closs: 0.8497 (0.9087) time: 1.0843 data: 0.0002 max mem: 17563 +[02:03:35.236727] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.3994 (0.4354) closs: 0.8437 (0.9080) time: 1.0840 data: 0.0002 max mem: 17563 +[02:03:46.089429] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.3948 (0.4353) closs: 0.8686 (0.9075) time: 1.0842 data: 0.0002 max mem: 17563 +[02:03:56.967232] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.3992 (0.4350) closs: 0.9080 (0.9081) time: 1.0865 data: 0.0002 max mem: 17563 +[02:04:07.889221] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.4095 (0.4347) closs: 0.9402 (0.9083) time: 1.0899 data: 0.0002 max mem: 17563 +[02:04:18.752487] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.4252 (0.4347) closs: 0.8932 (0.9080) time: 1.0892 data: 0.0002 max mem: 17563 +[02:04:29.575645] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.4252 (0.4346) closs: 0.8679 (0.9075) time: 1.0842 data: 0.0002 max mem: 17563 +[02:04:40.433907] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.4004 (0.4343) closs: 0.8577 (0.9075) time: 1.0840 data: 0.0002 max mem: 17563 +[02:04:51.273096] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.4196 (0.4342) closs: 0.8849 (0.9083) time: 1.0848 data: 0.0002 max mem: 17563 +[02:05:02.094971] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.4285 (0.4345) closs: 0.8671 (0.9082) time: 1.0830 data: 0.0001 max mem: 17563 +[02:05:13.024661] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.4137 (0.4345) closs: 0.8708 (0.9081) time: 1.0875 data: 0.0002 max mem: 17563 +[02:05:23.832680] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.4091 (0.4344) closs: 0.8956 (0.9075) time: 1.0868 data: 0.0002 max mem: 17563 +[02:05:34.663264] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.4201 (0.4345) closs: 0.9148 (0.9082) time: 1.0819 data: 0.0002 max mem: 17563 +[02:05:45.512165] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.4217 (0.4345) closs: 0.9572 (0.9087) time: 1.0839 data: 0.0002 max mem: 17563 +[02:05:56.436831] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.3903 (0.4340) closs: 0.9227 (0.9091) time: 1.0886 data: 0.0001 max mem: 17563 +[02:06:07.316986] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.4042 (0.4339) closs: 0.9078 (0.9094) time: 1.0902 data: 0.0001 max mem: 17563 +[02:06:18.230131] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.4042 (0.4338) closs: 0.9269 (0.9098) time: 1.0896 data: 0.0001 max mem: 17563 +[02:06:19.593496] Epoch: [2] Total time: 0:14:43 +[02:06:19.597975] Averaged stats: lr: 0.000016 grad_norm: 0.4042 (0.4338) closs: 0.9269 (0.9098) +[02:06:19.659399] model saved +[02:06:20.282609] optimizer saved +[02:06:20.283084] other rank-common saved +[02:06:20.284711] rank-specific saved +[02:06:20.288956] log_dir: ./output_dir +[02:06:22.710850] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.4176 (0.4176) closs: 0.6748 (0.6748) time: 2.4210 data: 1.3135 max mem: 17563 +[02:06:33.519859] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.4176 (0.4223) closs: 0.9734 (0.9533) time: 1.2027 data: 0.1195 max mem: 17563 +[02:06:44.266283] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.3989 (0.4313) closs: 0.9278 (0.9248) time: 1.0777 data: 0.0001 max mem: 17563 +[02:06:55.019207] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.3960 (0.4187) closs: 0.9133 (0.9231) time: 1.0749 data: 0.0002 max mem: 17563 +[02:07:05.786964] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.4012 (0.4199) closs: 0.9042 (0.9126) time: 1.0760 data: 0.0002 max mem: 17563 +[02:07:16.545258] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.4237 (0.4220) closs: 0.9111 (0.9140) time: 1.0762 data: 0.0002 max mem: 17563 +[02:07:27.329602] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.4194 (0.4208) closs: 0.9097 (0.9181) time: 1.0771 data: 0.0002 max mem: 17563 +[02:07:38.193921] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.4148 (0.4200) closs: 0.8976 (0.9157) time: 1.0824 data: 0.0003 max mem: 17563 +[02:07:49.051932] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.4067 (0.4203) closs: 0.9098 (0.9146) time: 1.0860 data: 0.0003 max mem: 17563 +[02:07:59.806022] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.4169 (0.4256) closs: 0.9106 (0.9134) time: 1.0805 data: 0.0002 max mem: 17563 +[02:08:10.644450] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.4417 (0.4269) closs: 0.9029 (0.9120) time: 1.0796 data: 0.0002 max mem: 17563 +[02:08:21.501936] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.4361 (0.4298) closs: 0.8539 (0.9054) time: 1.0847 data: 0.0002 max mem: 17563 +[02:08:32.378222] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.4314 (0.4294) closs: 0.8574 (0.9092) time: 1.0866 data: 0.0002 max mem: 17563 +[02:08:43.200717] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.4217 (0.4289) closs: 0.8647 (0.9055) time: 1.0849 data: 0.0002 max mem: 17563 +[02:08:54.073480] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.4077 (0.4268) closs: 0.8831 (0.9097) time: 1.0847 data: 0.0002 max mem: 17563 +[02:09:04.989287] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.4010 (0.4264) closs: 0.9527 (0.9097) time: 1.0894 data: 0.0002 max mem: 17563 +[02:09:15.881253] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.4010 (0.4247) closs: 0.8847 (0.9097) time: 1.0903 data: 0.0002 max mem: 17563 +[02:09:26.738896] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.4141 (0.4242) closs: 0.8847 (0.9086) time: 1.0874 data: 0.0002 max mem: 17563 +[02:09:37.549785] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.4163 (0.4252) closs: 0.8829 (0.9072) time: 1.0833 data: 0.0002 max mem: 17563 +[02:09:48.429478] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.4051 (0.4250) closs: 0.8713 (0.9066) time: 1.0844 data: 0.0002 max mem: 17563 +[02:09:59.361112] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.3976 (0.4232) closs: 0.9178 (0.9084) time: 1.0905 data: 0.0002 max mem: 17563 +[02:10:10.221693] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.3976 (0.4228) closs: 0.9029 (0.9076) 
time: 1.0895 data: 0.0002 max mem: 17563 +[02:10:20.991480] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.4122 (0.4218) closs: 0.8810 (0.9092) time: 1.0814 data: 0.0002 max mem: 17563 +[02:10:31.909511] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.3948 (0.4208) closs: 0.9004 (0.9098) time: 1.0843 data: 0.0002 max mem: 17563 +[02:10:42.753050] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.4029 (0.4285) closs: 0.8996 (0.9098) time: 1.0880 data: 0.0002 max mem: 17563 +[02:10:53.652353] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.4152 (0.4287) closs: 0.8878 (0.9093) time: 1.0871 data: 0.0002 max mem: 17563 +[02:11:04.489927] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.4357 (0.4288) closs: 0.8396 (0.9074) time: 1.0868 data: 0.0002 max mem: 17563 +[02:11:15.354091] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.4116 (0.4280) closs: 0.8460 (0.9079) time: 1.0850 data: 0.0002 max mem: 17563 +[02:11:26.291518] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.4071 (0.4279) closs: 0.8640 (0.9096) time: 1.0900 data: 0.0002 max mem: 17563 +[02:11:37.145372] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.4271 (0.4289) closs: 0.8646 (0.9088) time: 1.0895 data: 0.0002 max mem: 17563 +[02:11:47.992029] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.4576 (0.4295) closs: 0.8555 (0.9060) time: 1.0849 data: 0.0002 max mem: 17563 +[02:11:58.846182] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.4213 (0.4291) closs: 0.8791 (0.9076) time: 1.0850 data: 0.0004 max mem: 17563 +[02:12:09.703196] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.4203 (0.4295) closs: 0.9359 (0.9080) time: 1.0855 data: 0.0004 max mem: 17563 +[02:12:20.668253] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.4150 (0.4288) closs: 0.9336 (0.9078) time: 1.0910 data: 0.0002 max mem: 17563 +[02:12:31.500169] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.4150 (0.4283) closs: 0.9348 (0.9082) time: 1.0898 data: 0.0002 max mem: 17563 +[02:12:42.261952] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.4078 (0.4279) closs: 0.9334 (0.9088) time: 1.0796 data: 0.0002 max mem: 17563 +[02:12:53.174413] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.4009 (0.4282) closs: 0.9129 (0.9094) time: 1.0836 data: 0.0002 max mem: 17563 +[02:13:04.026514] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.4036 (0.4283) closs: 0.9507 (0.9097) time: 1.0882 data: 0.0002 max mem: 17563 +[02:13:14.864316] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.4236 (0.4280) closs: 0.8826 (0.9087) time: 1.0844 data: 0.0002 max mem: 17563 +[02:13:25.695888] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.4236 (0.4280) closs: 0.8234 (0.9077) time: 1.0834 data: 0.0002 max mem: 17563 +[02:13:36.522555] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.4061 (0.4279) closs: 0.8981 (0.9082) time: 1.0828 data: 0.0002 max mem: 17563 +[02:13:47.446926] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.4061 (0.4276) closs: 0.8877 (0.9069) time: 1.0875 data: 0.0002 max mem: 17563 +[02:13:58.297640] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.4293 (0.4275) closs: 0.8711 (0.9062) time: 1.0887 data: 0.0002 max mem: 17563 +[02:14:09.250109] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.4153 (0.4273) closs: 0.8631 (0.9048) time: 1.0901 data: 0.0002 max mem: 17563 +[02:14:20.064220] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.4153 (0.4271) closs: 0.8631 (0.9042) time: 1.0883 data: 0.0002 max mem: 17563 +[02:14:30.897325] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.4201 (0.4277) closs: 0.8829 (0.9038) time: 1.0823 data: 0.0002 max mem: 17563 +[02:14:41.784349] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.4253 (0.4280) closs: 0.8706 (0.9030) time: 1.0859 data: 0.0002 max mem: 17563 +[02:14:52.658859] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.4084 (0.4283) closs: 0.8706 (0.9027) time: 1.0880 data: 0.0002 max mem: 17563 +[02:15:03.399053] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.4084 (0.4292) closs: 0.8693 (0.9023) time: 1.0807 data: 0.0002 max mem: 17563 +[02:15:14.332241] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.4107 (0.4287) closs: 0.8744 (0.9026) time: 1.0836 data: 0.0002 max mem: 17563 +[02:15:25.185289] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.3996 (0.4288) closs: 0.8780 (0.9021) time: 1.0892 data: 0.0002 max mem: 17563 +[02:15:36.055738] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.3932 (0.4287) closs: 0.9032 (0.9035) time: 1.0861 data: 0.0002 max mem: 17563 +[02:15:46.875809] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.4266 (0.4289) closs: 0.9807 (0.9043) time: 1.0845 data: 0.0002 max mem: 17563 +[02:15:57.742641] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.4378 (0.4295) closs: 0.8574 (0.9035) time: 1.0843 data: 0.0002 max mem: 17563 +[02:16:08.662372] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.4378 (0.4297) closs: 0.8314 (0.9034) time: 1.0893 data: 0.0002 max mem: 17563 +[02:16:19.574885] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.4160 (0.4300) closs: 0.8906 (0.9025) time: 1.0915 data: 0.0002 max mem: 17563 +[02:16:30.444433] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.4111 (0.4297) closs: 0.8671 (0.9019) time: 1.0890 data: 0.0002 max mem: 17563 +[02:16:41.256177] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.4156 (0.4300) closs: 0.8896 (0.9025) time: 1.0840 data: 0.0002 max mem: 17563 +[02:16:52.063035] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.4278 (0.4300) closs: 0.9103 (0.9030) time: 1.0809 data: 0.0002 max mem: 17563 +[02:17:02.975320] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.4202 (0.4300) closs: 0.8944 (0.9028) time: 1.0859 data: 0.0002 max mem: 17563 +[02:17:13.808127] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.4237 (0.4300) closs: 0.8329 (0.9024) time: 1.0872 data: 0.0002 max mem: 17563 +[02:17:24.593205] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.4493 (0.4302) closs: 0.9092 (0.9032) time: 1.0808 data: 0.0002 max mem: 17563 +[02:17:35.534419] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.4336 (0.4304) closs: 0.8930 (0.9035) time: 1.0862 data: 0.0002 max mem: 17563 +[02:17:46.375586] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.4340 (0.4311) closs: 0.8824 (0.9027) time: 1.0890 data: 0.0002 max mem: 17563 +[02:17:57.232787] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.4289 (0.4309) closs: 0.8533 (0.9024) time: 1.0848 data: 0.0002 max mem: 17563 +[02:18:08.059916] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.4289 (0.4312) closs: 0.9116 (0.9035) time: 1.0841 data: 0.0002 max mem: 17563 +[02:18:18.919927] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.4250 (0.4311) closs: 0.9397 (0.9042) time: 1.0843 data: 0.0002 max mem: 17563 +[02:18:29.845536] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.4118 (0.4311) closs: 0.8926 (0.9034) time: 1.0892 data: 0.0002 max mem: 17563 +[02:18:40.681588] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.4088 (0.4308) closs: 0.8901 (0.9040) time: 1.0880 data: 0.0002 max mem: 17563 +[02:18:51.531737] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.3880 (0.4305) closs: 0.8910 (0.9038) time: 1.0842 data: 0.0002 max mem: 17563 +[02:19:02.397259] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.4102 (0.4306) closs: 0.8910 (0.9042) time: 1.0857 data: 0.0002 max mem: 17563 
+[02:19:13.242914] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.4370 (0.4310) closs: 0.9252 (0.9046) time: 1.0855 data: 0.0002 max mem: 17563 +[02:19:24.166388] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.4094 (0.4308) closs: 0.9252 (0.9048) time: 1.0884 data: 0.0002 max mem: 17563 +[02:19:35.054899] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.4150 (0.4309) closs: 0.8867 (0.9040) time: 1.0905 data: 0.0003 max mem: 17563 +[02:19:45.825930] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.4151 (0.4305) closs: 0.8867 (0.9045) time: 1.0829 data: 0.0003 max mem: 17563 +[02:19:56.768129] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.4089 (0.4305) closs: 0.8809 (0.9040) time: 1.0856 data: 0.0002 max mem: 17563 +[02:20:07.605720] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.4198 (0.4309) closs: 0.8766 (0.9045) time: 1.0889 data: 0.0002 max mem: 17563 +[02:20:18.461056] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.4191 (0.4308) closs: 0.9031 (0.9046) time: 1.0846 data: 0.0002 max mem: 17563 +[02:20:29.271451] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.4191 (0.4312) closs: 0.8966 (0.9044) time: 1.0832 data: 0.0002 max mem: 17563 +[02:20:40.125154] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.4262 (0.4314) closs: 0.8927 (0.9045) time: 1.0831 data: 0.0001 max mem: 17563 +[02:20:51.038156] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.4216 (0.4313) closs: 0.9266 (0.9048) time: 1.0883 data: 0.0001 max mem: 17563 +[02:21:01.797044] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.4304 (0.4317) closs: 0.9818 (0.9056) time: 1.0835 data: 0.0001 max mem: 17563 +[02:21:03.159536] Epoch: [3] Total time: 0:14:42 +[02:21:03.170294] Averaged stats: lr: 0.000005 grad_norm: 0.4321 (0.4319) closs: 0.9721 (0.9059) +[02:21:03.235300] model saved +[02:21:03.979453] optimizer saved +[02:21:03.980150] other rank-common saved +[02:21:03.983130] rank-specific saved +[02:21:03.983442] Training time 0:58:58
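
Editor's note: besides the binary checkpoints (committed here only as Git LFS pointer stubs), the run leaves two small machine-readable artifacts: the per-epoch JSON lines in log.txt and the three-field pointer files themselves (version / oid / size). The following is a minimal sketch, not part of the training code, assuming the paths added in this diff relative to the repository root:

    import json
    from pathlib import Path

    BASE = Path("finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B")

    def read_metrics(log_path: Path) -> list[dict]:
        # log.txt holds one JSON object per line, one line per epoch.
        with open(log_path) as f:
            return [json.loads(line) for line in f if line.strip()]

    def read_lfs_pointer(pointer_path: Path) -> dict:
        # A Git LFS pointer stub is three "key value" lines: version, oid, size.
        fields = {}
        for line in pointer_path.read_text().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    if __name__ == "__main__":
        for row in read_metrics(BASE / "log.txt"):
            print(f"epoch {row['epoch']}: closs={row['train_closs']:.4f} lr={row['train_lr']:.2e}")
        ptr = read_lfs_pointer(BASE / "epoch3" / "consolidated.00-of-01.model.pth")
        print("epoch3 model:", ptr["oid"], f"{int(ptr['size'])} bytes")

The four closs values this prints (0.9724, 0.9194, 0.9098, 0.9059) match the "Averaged stats" lines in output.log above, and the lr column traces the schedule declared in the Namespace: linear warmup across epoch 0 (warmup_epochs=1.0, peaking at lr=5e-05) followed by decay toward min_lr=5e-06 by the end of epoch 3.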