aws-neuron
/

optimum-neuron-cache

dacorvo HF Staff committed 19 days ago

Commit

3b6312a

verified ·

1 Parent(s): 4103db7

Add batch size 4 configurations for Llama 1B and 3B models

Files changed (1) hide show

inference-cache-config/llama.json CHANGED Viewed

@@ -74,6 +74,14 @@
       "num_cores": 2,
       "auto_cast_type": "bf16"
     }
   ],
    "meta-llama/Llama-3.2-3B": [
     {
@@ -82,6 +90,14 @@
       "num_cores": 2,
       "auto_cast_type": "bf16"
     }
   ],
      "TinyLlama/TinyLlama-1.1B-Chat-v1.0": [
     {

       "num_cores": 2,
       "auto_cast_type": "bf16"
     }
+  ],
+     "meta-llama/Llama-3.2-1B": [
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 2,
+      "auto_cast_type": "bf16"
+    }
   ],
    "meta-llama/Llama-3.2-3B": [
     {
       "num_cores": 2,
       "auto_cast_type": "bf16"
     }
+  ],
+     "meta-llama/Llama-3.2-3B": [
+    {
+      "batch_size": 4,
+      "sequence_length": 4096,
+      "num_cores": 2,
+      "auto_cast_type": "bf16"
+    }
   ],
      "TinyLlama/TinyLlama-1.1B-Chat-v1.0": [
     {