BitsAndBytes quantization inference error
#5
by chengfy - opened
BitsAndBytes 8-bit quantization is giving me errors (4-bit errors are similar). Here's my error log:
../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[2], line 22
6 inputs = tokenizer.apply_chat_template(
7 message,
8 return_tensors="pt",
9 add_generation_prompt=True,
10 return_dict=True,
11 ).to(model.device)
13 generate_kwargs = {
14 "input_ids": inputs["input_ids"],
15 "attention_mask": inputs["attention_mask"],
(...) 20 "temperature": 0.6,
21 }
---> 22 out = model.generate(**generate_kwargs)
23 generate_resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:-1], skip_special_tokens=False)
24 stop_sequence = tokenizer.decode(out[0][-1:], skip_speical_tokens=False)
File ~/**/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/**/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:2848, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)
2840 input_ids, model_kwargs = self._expand_inputs_for_generation(
2841 input_ids=input_ids,
2842 expand_size=generation_config.num_return_sequences,
2843 is_encoder_decoder=self.config.is_encoder_decoder,
2844 **model_kwargs,
2845 )
2847 # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2848 result = self._sample(
2849 input_ids,
2850 logits_processor=prepared_logits_processor,
2851 stopping_criteria=prepared_stopping_criteria,
2852 generation_config=generation_config,
2853 synced_gpus=synced_gpus,
2854 streamer=streamer,
2855 **model_kwargs,
2856 )
2858 elif generation_mode in (
2859 GenerationMode.BEAM_SAMPLE,
2860 GenerationMode.BEAM_SEARCH,
2861 ):
2862 # 11. interleave input_ids with `num_beams` additional sequences per batch
2863 input_ids, model_kwargs = self._expand_inputs_for_generation(
2864 input_ids=input_ids,
2865 expand_size=generation_config.num_beams,
2866 is_encoder_decoder=self.config.is_encoder_decoder,
2867 **model_kwargs,
2868 )
File ~/**/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:4020, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
4018 probs = nn.functional.softmax(next_token_scores, dim=-1)
4019 # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 4020 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
4021 else:
4022 next_tokens = torch.argmax(next_token_scores, dim=-1)
RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
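For context on the assert itself: in the frame at generation/utils.py:4020, probs is just softmax(next_token_scores), and torch.multinomial refuses to sample from a probability tensor containing NaN, inf, or negative entries, so the scores coming out of the quantized forward pass were presumably already NaN/inf before sampling. A minimal illustration of that same check (my own sketch, not taken from this thread):

import torch

# torch.multinomial validates the probability tensor before sampling. On CUDA
# the check is the device-side assert quoted in the log above; on CPU it raises
# a RuntimeError synchronously.
probs = torch.tensor([0.5, float("nan"), 0.5])
try:
    torch.multinomial(probs, num_samples=1)
except RuntimeError as err:
    print(err)  # probability tensor contains either `inf`, `nan` or element < 0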
My code is as follows:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Debugging flags: CUDA_LAUNCH_BLOCKING makes CUDA errors surface at the failing op.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

MODEL_PATH = "/data/models/GLM-4-32B-0414"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model with bitsandbytes 8-bit quantization, fp16 for the non-quantized parts.
quant = {
    "torch_dtype": torch.float16,
    "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
}
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **quant)

message = [
    {"role": "system", "content": "You are a useful AI assistant"},
    {"role": "user", "content": "Hello"},
]
inputs = tokenizer.apply_chat_template(
    message,
    return_tensors="pt",
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

generate_kwargs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "max_new_tokens": 1024,
    "do_sample": True,
    "top_p": 0.95,
    "top_k": 40,
    "temperature": 0.6,
}
out = model.generate(**generate_kwargs)

# Decode the newly generated tokens (excluding the prompt and the final stop token).
generate_resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:-1], skip_special_tokens=False)
stop_sequence = tokenizer.decode(out[0][-1:], skip_special_tokens=False)
print(generate_resp)
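In case it helps whoever picks this up, a diagnostic variant I would try (a sketch under my own assumptions, not something verified in this thread): run a single forward pass and check whether the quantized model's logits already contain NaN/inf, and compare against greedy decoding, which never calls torch.multinomial. If the raw logits are already NaN, the problem is in the quantized forward pass rather than in the sampling parameters.

import torch

# Assumes `model`, `tokenizer`, and `inputs` exactly as defined in the snippet above.
with torch.no_grad():
    logits = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    ).logits

# True here means the logits are broken before sampling even starts.
print("NaN in logits:", torch.isnan(logits).any().item())
print("Inf in logits:", torch.isinf(logits).any().item())

# Greedy decoding skips torch.multinomial, so it will not trip the device-side
# assert; useful to see whether generation works at all with this quantization.
greedy_out = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=32,
    do_sample=False,
)
print(tokenizer.decode(greedy_out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False))

If the logits do turn out to be NaN, one variable worth isolating is the float16 dtype used for the non-quantized layers, since bfloat16 has a wider dynamic range (again just a guess, not a confirmed fix for this model).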
chengfy changed discussion status to closed
chengfy changed discussion status to open