```python
import torch
from transformers import AwqConfig, AutoModelForCausalLM

model_id = "TheBloke/Yi-34B-AWQ"

# fusing requires knowledge of the model architecture; the values below correspond to Yi-34B
quantization_config = AwqConfig(
    bits=4,
    fuse_max_seq_len=512,
    modules_to_fuse={
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "layernorm": ["ln1", "ln2", "norm"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "use_alibi": False,
        "num_attention_heads": 56,
        "num_key_value_heads": 8,
        "hidden_size": 7168
    }
)

# load the quantized model with the fused modules and place it on GPU 0
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
```
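Once the fused model is loaded, generation works the same as with any other model. A minimal sketch is shown below; the prompt and generation settings are only illustrative, and the total sequence length should stay under `fuse_max_seq_len` (512 here).

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

# illustrative prompt; keep the sequence length under fuse_max_seq_len
inputs = tokenizer("What are the benefits of AWQ quantization?", return_tensors="pt").to(0)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```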
The parameter `modules_to_fuse` should include:
"attention": The names of the attention layers to fuse in the following order: query, key, value and output projection layer. |