Update README.md
README.md (changed)
The example code switches from the 4-bit `BitsAndBytesConfig` to an 8-bit one: the `quant_config_4` block is commented out, a `quant_config_8` block is added, and `from_pretrained` now receives `quantization_config=quant_config_8`.

@@ -41,18 +41,24 @@ import torch
 
 # Load the model and tokenizer
 model_name = "huihui-ai/DeepSeek-R1-Distill-Qwen-Coder-32B-Fusion-9010"
-quant_config_4 = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
+#quant_config_4 = BitsAndBytesConfig(
+#    load_in_4bit=True,
+#    bnb_4bit_compute_dtype=torch.bfloat16,
+#    bnb_4bit_use_double_quant=True,
+#    llm_int8_enable_fp32_cpu_offload=True,
+#)
+
+quant_config_8 = BitsAndBytesConfig(
+    load_in_8bit=True,
     llm_int8_enable_fp32_cpu_offload=True,
+    llm_int8_has_fp16_weight=True,
 )
 
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
-    quantization_config=quant_config_4,
+    quantization_config=quant_config_8,
     device_map="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
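For a quick sanity check of the new 8-bit path, something like the following sketch should work. The model name, the `quant_config_8` settings, and the `from_pretrained` arguments come from the diff above; the chat prompt and generation parameters are illustrative assumptions, not part of the commit:

```python
# Sanity-check sketch for the 8-bit config above. The prompt and generation
# settings below are illustrative assumptions, not part of the README.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "huihui-ai/DeepSeek-R1-Distill-Qwen-Coder-32B-Fusion-9010"

quant_config_8 = BitsAndBytesConfig(
    load_in_8bit=True,                      # int8 weights via bitsandbytes
    llm_int8_enable_fp32_cpu_offload=True,  # offload layers that don't fit on GPU to CPU in fp32
    llm_int8_has_fp16_weight=True,          # keep 16-bit main weights alongside int8
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config_8,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Assumes the tokenizer ships a chat template, as the DeepSeek-R1 distills do.
messages = [{"role": "user", "content": "Write a Python function that checks whether a string is a palindrome."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Note that `llm_int8_has_fp16_weight=True` keeps the 16-bit main weights alongside the int8 kernels (mainly useful for fine-tuning), so it trades extra memory for avoiding weight conversion; the commented-out 4-bit config remains the lower-memory way to load this 32B model.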