BitsAndBytes quantization inference error

#5
by chengfy - opened

Inference with BitsAndBytes 8-bit quantization is throwing errors (4-bit quantization fails with a similar error). Here is my error log:

../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[2], line 22
      6 inputs = tokenizer.apply_chat_template(
      7     message,
      8     return_tensors="pt",
      9     add_generation_prompt=True,
     10     return_dict=True,
     11 ).to(model.device)
     13 generate_kwargs = {
     14     "input_ids": inputs["input_ids"],
     15     "attention_mask": inputs["attention_mask"],
   (...)     20     "temperature": 0.6,
     21 }
---> 22 out = model.generate(**generate_kwargs)
     23 generate_resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:-1], skip_special_tokens=False)
     24 stop_sequence = tokenizer.decode(out[0][-1:], skip_speical_tokens=False)

File ~/**/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File ~/**/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:2848, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, **kwargs)
   2840     input_ids, model_kwargs = self._expand_inputs_for_generation(
   2841         input_ids=input_ids,
   2842         expand_size=generation_config.num_return_sequences,
   2843         is_encoder_decoder=self.config.is_encoder_decoder,
   2844         **model_kwargs,
   2845     )
   2847     # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2848     result = self._sample(
   2849         input_ids,
   2850         logits_processor=prepared_logits_processor,
   2851         stopping_criteria=prepared_stopping_criteria,
   2852         generation_config=generation_config,
   2853         synced_gpus=synced_gpus,
   2854         streamer=streamer,
   2855         **model_kwargs,
   2856     )
   2858 elif generation_mode in (
   2859     GenerationMode.BEAM_SAMPLE,
   2860     GenerationMode.BEAM_SEARCH,
   2861 ):
   2862     # 11. interleave input_ids with `num_beams` additional sequences per batch
   2863     input_ids, model_kwargs = self._expand_inputs_for_generation(
   2864         input_ids=input_ids,
   2865         expand_size=generation_config.num_beams,
   2866         is_encoder_decoder=self.config.is_encoder_decoder,
   2867         **model_kwargs,
   2868     )

File ~/**/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:4020, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
   4018     probs = nn.functional.softmax(next_token_scores, dim=-1)
   4019     # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 4020     next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
   4021 else:
   4022     next_tokens = torch.argmax(next_token_scores, dim=-1)

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
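
The assertion comes from torch.multinomial inside _sample, i.e. the softmax probabilities already contained inf, nan, or a negative value by the time sampling ran. A minimal check (a sketch, reusing the model and inputs objects defined in the code below) to see whether the quantized forward pass itself already produces non-finite logits:

import torch

# One forward pass with the quantized model; if the logits are already
# non-finite here, the problem occurs before sampling ever runs.
with torch.no_grad():
    logits = model(**inputs).logits

print("any nan:", torch.isnan(logits).any().item())
print("any inf:", torch.isinf(logits).any().item())
print("logits dtype:", logits.dtype)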

My code is as follows:

import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

MODEL_PATH = "/data/models/GLM-4-32B-0414"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Load the model with bitsandbytes 8-bit quantization; non-quantized modules stay in float16
quant = {
    'torch_dtype': torch.float16,
    'quantization_config': BitsAndBytesConfig(
        load_in_8bit=True
    )
}
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **quant)

message = [
    {"role": "system", "content": "You are a useful AI assistant"},
    {"role": "user", "content": "Hello"}
]

inputs = tokenizer.apply_chat_template(
    message,
    return_tensors="pt",
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

generate_kwargs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "max_new_tokens": 1024,
    "do_sample": True,
    "top_p": 0.95,
    "top_k": 40,
    "temperature": 0.6,
}
out = model.generate(**generate_kwargs)
# Decode the generated continuation (without the final stop token) and the stop token itself
generate_resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:-1], skip_special_tokens=False)
stop_sequence = tokenizer.decode(out[0][-1:], skip_special_tokens=False)
print(generate_resp)
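
For reference, this kind of non-finite-probability assert with 8-bit models is commonly worked around by keeping the non-quantized parts in bfloat16 instead of float16, or by switching to 4-bit NF4 with a bfloat16 compute dtype. This is only a sketch of those configurations (standard BitsAndBytesConfig parameters, not verified against this particular model):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Option A: 8-bit weights, keeping the rest of the model in bfloat16
quant_bf16_8bit = {
    'torch_dtype': torch.bfloat16,
    'quantization_config': BitsAndBytesConfig(load_in_8bit=True),
}

# Option B: 4-bit NF4 quantization with a bfloat16 compute dtype
quant_nf4 = {
    'torch_dtype': torch.bfloat16,
    'quantization_config': BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    ),
}

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **quant_bf16_8bit)

If neither variant helps, the logits check above at least narrows the problem to the quantized forward pass rather than the sampling settings.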