In [None]:
!pip install -q -U huggingface_hub peft transformers torch accelerate

In [None]:
# Show the NVIDIA GPU(s) visible to this runtime before loading the model.
!nvidia-smi


In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


In [None]:
# Log in to the Hugging Face Hub; the model-loading cell requests an
# authenticated download of the base model and tokenizer.
!huggingface-cli login

In [None]:
# Adapter repository on the Hugging Face Hub. Its PEFT config records the
# base model the adapter is applied to (`base_model_name_or_path`).
peft_model_id = "dfurman/llama-2-13b-dolphin-peft"
config = PeftConfig.from_pretrained(peft_model_id)

# `token=True` uses the credentials stored by the login step; it replaces the
# deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    token=True,
)
# Pad with the EOS token; `pad_token_id` is later passed to `generate`.
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in bfloat16 and let accelerate place it on the
# available device(s).
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=True,
)

# Load the LoRA adapter on top of the base model. A separate name is used for
# the base model so `model` always refers to the adapter-wrapped model.
model = PeftModel.from_pretrained(base_model, peft_model_id)

In [None]:
def llama_generate(
    model: "AutoModelForCausalLM",
    tokenizer: "AutoTokenizer",
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 1.0,
) -> str:
    """
    Generate a completion for ``prompt`` and return only the newly generated text.

    Generation settings not passed explicitly fall back to the Hugging Face
    GenerationConfig defaults:
    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig

    Args:
        model (transformers.AutoModelForCausalLM): Causal language model used for
            text generation; must provide ``generate``.
        tokenizer (transformers.AutoTokenizer): Tokenizer for the model.
        prompt (str): Prompt for text generation.
        max_new_tokens (int, optional): Max new tokens after the prompt to generate.
            Defaults to 128.
        temperature (float, optional): The value used to modulate the next token
            probabilities. Defaults to 1.0.

    Returns:
        str: The decoded continuation (special tokens removed), without the prompt.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize the single prompt as a batch of one and move it to the device.
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # When running Torch modules in lower precision, it is best practice to use
    # the torch.autocast context manager. The device type follows the device
    # chosen above so the CPU fallback does not request CUDA autocast.
    with torch.autocast(device.type, dtype=torch.bfloat16):
        # NOTE(review): `temperature` only influences the output when sampling
        # is enabled (do_sample=True); that is left to the model's generation
        # config here -- confirm this is the intended behavior.
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count rather than by character count, because the decoded prompt is not
    # guaranteed to be character-identical to the original prompt string.
    generated_ids = response["sequences"][0][prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True)


In [None]:
# Instruction-style prompt for the demo.
prompt = "You are a helpful AI assistant. Write me a numbered list of things to do in New York City.\n"

response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=150,
    temperature=0.92,
)

# print() rather than a bare expression so newlines in the answer render as
# line breaks instead of escaped "\n" characters.
print(response)