{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "A100"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LqFeWyhye38d"
},
"outputs": [],
"source": [
"!pip install -q -U huggingface_hub peft transformers torch accelerate"
]
},
{
"cell_type": "code",
"source": [
"!nvidia-smi\n"
],
"metadata": {
"id": "y5FkaLZcfAHm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from peft import PeftModel, PeftConfig\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n"
],
"metadata": {
"id": "EKXLttEgf06g"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!huggingface-cli login"
],
"metadata": {
"id": "Q_8EpxK4gUZI"
},
"execution_count": null,
"outputs": []
},
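{
"cell_type": "code",
"source": [
"# Optional alternative (not in the original notebook): log in from Python\n",
"# instead of the CLI, using huggingface_hub's notebook_login widget.\n",
"# from huggingface_hub import notebook_login\n",
"# notebook_login()"
],
"metadata": {},
"execution_count": null,
"outputs": []
},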
{
"cell_type": "code",
"source": [
"peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n",
"config = PeftConfig.from_pretrained(peft_model_id)\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" config.base_model_name_or_path,\n",
" use_auth_token=True\n",
")\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" config.base_model_name_or_path,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" use_auth_token=True,\n",
")\n",
"\n",
"# Load the Lora model\n",
"model = PeftModel.from_pretrained(model, peft_model_id)"
],
"metadata": {
"id": "AGxrbUqDgD8D"
},
"execution_count": null,
"outputs": []
},
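{
"cell_type": "code",
"source": [
"# Optional sanity check (not in the original notebook): put the model in eval\n",
"# mode and report its memory footprint before generating.\n",
"model.eval()\n",
"print(f\"Memory footprint: {model.get_memory_footprint() / 1e9:.1f} GB\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},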
{
"cell_type": "code",
"source": [
"def llama_generate(\n",
" model: AutoModelForCausalLM,\n",
" tokenizer: AutoTokenizer,\n",
" prompt: str,\n",
" max_new_tokens: int = 128,\n",
" temperature: int = 1.0,\n",
") -> str:\n",
" \"\"\"\n",
" Initialize the pipeline\n",
" Uses Hugging Face GenerationConfig defaults\n",
" https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
" Args:\n",
" model (transformers.AutoModelForCausalLM): Falcon model for text generation\n",
" tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
" prompt (str): Prompt for text generation\n",
" max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
" temperature (float, optional): The value used to modulate the next token probabilities.\n",
" Defaults to 1.0\n",
" \"\"\"\n",
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
" inputs = tokenizer(\n",
" [prompt],\n",
" return_tensors=\"pt\",\n",
" return_token_type_ids=False,\n",
" ).to(\n",
" device\n",
" ) # tokenize inputs, load on device\n",
"\n",
" # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
" with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
" response = model.generate(\n",
" **inputs,\n",
" max_new_tokens=max_new_tokens,\n",
" temperature=temperature,\n",
" return_dict_in_generate=True,\n",
" eos_token_id=tokenizer.eos_token_id,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
" )\n",
"\n",
" decoded_output = tokenizer.decode(\n",
" response[\"sequences\"][0],\n",
" skip_special_tokens=True,\n",
" ) # grab output in natural language\n",
"\n",
" return decoded_output[len(prompt) :] # remove prompt from output\n"
],
"metadata": {
"id": "OQD_s1-egFjB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"prompt = \"Your are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
"\n",
"response = llama_generate(\n",
" model,\n",
" tokenizer,\n",
" prompt,\n",
" max_new_tokens=150,\n",
" temperature=0.92,\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "mKXUkc6BgjdL"
},
"execution_count": null,
"outputs": []
},
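{
"cell_type": "code",
"source": [
"# Optional follow-up (a sketch, not in the original notebook): rerun the same\n",
"# prompt at a lower temperature for a more deterministic completion.\n",
"response = llama_generate(\n",
"    model,\n",
"    tokenizer,\n",
"    prompt,\n",
"    max_new_tokens=150,\n",
"    temperature=0.1,\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "JOgPF_UdgnWr"
},
"execution_count": null,
"outputs": []
}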
]
}