alexmarques committed (verified)
Commit 04e53e0 · 1 Parent(s): 1373d3e

Update README.md

Files changed (1): README.md (+19 -19)
README.md CHANGED
@@ -56,7 +56,7 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/
  from vllm import LLM, SamplingParams
  from transformers import AutoTokenizer

- model_id = "neuralmagic-ent/phi-4-FP8-dynamic"
+ model_id = "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
  number_gpus = 1

  sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)
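Note: the hunk above only renames `model_id` inside the README's vLLM deployment snippet (and fixes the leftover phi-4 name). For context, a minimal sketch of how that snippet is typically completed is shown below; the chat message and the generate/print calls are assumptions based on the standard vLLM API, not text taken from this diff.

```
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
number_gpus = 1

sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# Build a chat-formatted prompt with the model's tokenizer (illustrative message).
tokenizer = AutoTokenizer.from_pretrained(model_id)
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Load the FP8-dynamic checkpoint across `number_gpus` GPUs and generate.
llm = LLM(model=model_id, tensor_parallel_size=number_gpus)
outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```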
@@ -136,7 +136,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
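The MMLU command above is cut off at the hunk boundary. A hedged reconstruction of the full invocation under the new model name follows; the trailing `--num_fewshot 5` and `--batch_size auto` flags are assumptions (standard lm-evaluation-harness options), not visible in this diff.

```
# Hedged reconstruction of the full MMLU run; the last two flags are assumptions
# (standard lm-evaluation-harness options) and are not shown in the hunk above.
lm_eval \
  --model vllm \
  --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```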
@@ -148,7 +148,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
  --tasks mmlu_cot_llama \
  --apply_chat_template \
  --num_fewshot 0 \
@@ -159,7 +159,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
  --tasks arc_challenge_llama \
  --apply_chat_template \
  --num_fewshot 0 \
@@ -170,7 +170,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
  --tasks gsm8k_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -182,7 +182,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks hellaswag \
  --num_fewshot 10 \
  --batch_size auto
@@ -192,7 +192,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks winogrande \
  --num_fewshot 5 \
  --batch_size auto
@@ -202,7 +202,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
  --tasks truthfulqa \
  --num_fewshot 0 \
  --batch_size auto
@@ -212,7 +212,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,tensor_parallel_size=1,enable_chunked_prefill=True \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,tensor_parallel_size=1,enable_chunked_prefill=True \
  --apply_chat_template \
  --fewshot_as_multiturn \
  --tasks leaderboard \
@@ -223,7 +223,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_pt_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -235,7 +235,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_es_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -247,7 +247,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_it_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -259,7 +259,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_de_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -271,7 +271,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_fr_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -283,7 +283,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_hi_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -295,7 +295,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  ```
  lm_eval \
  --model vllm \
- --model_args pretrained="neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+ --model_args pretrained="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_th_llama \
  --fewshot_as_multiturn \
  --apply_chat_template \
@@ -307,7 +307,7 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  *Generation*
  ```
  python3 codegen/generate.py \
- --model neuralmagic-ent/Llama-3.3-70B-Instruct-FP8-dynamic \
+ --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic \
  --bs 16 \
  --temperature 0.2 \
  --n_samples 50 \
@@ -318,14 +318,14 @@ HumanEval and HumanEval+ evaluations were conducted using Neural Magic's fork of
  *Sanitization*
  ```
  python3 evalplus/sanitize.py \
- humaneval/neuralmagic-ent--Llama-3.3-70B-Instruct-FP8-dynamic_vllm_temp_0.2
+ humaneval/RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic_vllm_temp_0.2
  ```

  *Evaluation*
  ```
  evalplus.evaluate \
  --dataset humaneval \
- --samples humaneval/neuralmagic-ent--Llama-3.3-70B-Instruct-FP8-dynamic_vllm_temp_0.2-sanitized
+ --samples humaneval/RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic_vllm_temp_0.2-sanitized
  ```
  </details>
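The sanitization and evaluation paths change together with the model id because the samples directory name is derived from it. Below is a hedged sketch of that relationship, assuming the directory is simply the model id with `/` replaced by `--` plus a `_vllm_temp_<temperature>` suffix, as the paths in the hunk above suggest.

```
# Hedged sketch: derive the HumanEval samples directory from the model id.
# Assumption: "/" -> "--" plus "_vllm_temp_0.2", matching the paths in the diff.
MODEL="RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
SAMPLES="humaneval/${MODEL//\//--}_vllm_temp_0.2"

python3 evalplus/sanitize.py "${SAMPLES}"

evalplus.evaluate \
  --dataset humaneval \
  --samples "${SAMPLES}-sanitized"
```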
 