{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "LqFeWyhye38d" }, "outputs": [], "source": [ "!pip install -q -U huggingface_hub peft transformers torch accelerate" ] }, { "cell_type": "code", "source": [ "!nvidia-smi\n" ], "metadata": { "id": "y5FkaLZcfAHm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "from peft import PeftModel, PeftConfig\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n" ], "metadata": { "id": "EKXLttEgf06g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!huggingface-cli login" ], "metadata": { "id": "Q_8EpxK4gUZI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "peft_model_id = \"dfurman/llama-2-13b-dolphin-peft\"\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\n", " config.base_model_name_or_path,\n", " use_auth_token=True\n", ")\n", "tokenizer.pad_token = tokenizer.eos_token\n", "model = AutoModelForCausalLM.from_pretrained(\n", " config.base_model_name_or_path,\n", " torch_dtype=torch.bfloat16,\n", " device_map=\"auto\",\n", " use_auth_token=True,\n", ")\n", "\n", "# Load the Lora model\n", "model = PeftModel.from_pretrained(model, peft_model_id)" ], "metadata": { "id": "AGxrbUqDgD8D" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def llama_generate(\n", " model: AutoModelForCausalLM,\n", " tokenizer: AutoTokenizer,\n", " prompt: str,\n", " max_new_tokens: int = 128,\n", " temperature: int = 1.0,\n", ") -> str:\n", " \"\"\"\n", " Initialize the pipeline\n", " Uses Hugging Face GenerationConfig defaults\n", " https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n", " Args:\n", " model (transformers.AutoModelForCausalLM): Falcon model for text generation\n", " tokenizer (transformers.AutoTokenizer): Tokenizer for model\n", " prompt (str): Prompt for text generation\n", " max_new_tokens (int, optional): Max new tokens after the prompt to generate. 
    { "cell_type": "code",
      "source": [
        "def llama_generate(\n",
        "    model: AutoModelForCausalLM,\n",
        "    tokenizer: AutoTokenizer,\n",
        "    prompt: str,\n",
        "    max_new_tokens: int = 128,\n",
        "    temperature: float = 1.0,\n",
        ") -> str:\n",
        "    \"\"\"Generate a completion for `prompt` and return it with the prompt stripped.\n",
        "\n",
        "    Uses Hugging Face GenerationConfig defaults for anything not set here:\n",
        "    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
        "\n",
        "    Args:\n",
        "        model (transformers.AutoModelForCausalLM): Llama 2 model for text generation\n",
        "        tokenizer (transformers.AutoTokenizer): Tokenizer for the model\n",
        "        prompt (str): Prompt for text generation\n",
        "        max_new_tokens (int, optional): Max new tokens to generate after the prompt. Defaults to 128.\n",
        "        temperature (float, optional): Value used to modulate the next-token probabilities. Defaults to 1.0.\n",
        "    \"\"\"\n",
        "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "    # Tokenize the prompt and move it to the model's device\n",
        "    inputs = tokenizer(\n",
        "        [prompt],\n",
        "        return_tensors=\"pt\",\n",
        "        return_token_type_ids=False,\n",
        "    ).to(device)\n",
        "\n",
        "    # When running torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
        "    with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
        "        response = model.generate(\n",
        "            **inputs,\n",
        "            max_new_tokens=max_new_tokens,\n",
        "            do_sample=True,  # sample so that `temperature` actually takes effect\n",
        "            temperature=temperature,\n",
        "            return_dict_in_generate=True,\n",
        "            eos_token_id=tokenizer.eos_token_id,\n",
        "            pad_token_id=tokenizer.pad_token_id,\n",
        "        )\n",
        "\n",
        "    # Decode the generated ids back to natural language\n",
        "    decoded_output = tokenizer.decode(\n",
        "        response[\"sequences\"][0],\n",
        "        skip_special_tokens=True,\n",
        "    )\n",
        "\n",
        "    return decoded_output[len(prompt) :]  # remove the prompt from the output\n"
      ],
      "metadata": { "id": "OQD_s1-egFjB" }, "execution_count": null, "outputs": [] },
    { "cell_type": "code",
      "source": [
        "prompt = \"You are a helpful AI assistant. Write me a numbered list of things to do in New York City.\\n\"\n",
        "\n",
        "response = llama_generate(\n",
        "    model,\n",
        "    tokenizer,\n",
        "    prompt,\n",
        "    max_new_tokens=150,\n",
        "    temperature=0.92,\n",
        ")\n",
        "\n",
        "print(response)"
      ],
      "metadata": { "id": "mKXUkc6BgjdL" }, "execution_count": null, "outputs": [] },
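    { "cell_type": "code",
      "source": [
        "# Added sketch (not part of the original walkthrough): stream the completion token\n",
        "# by token instead of waiting for the full response. TextStreamer is the transformers\n",
        "# utility that prints decoded tokens as generate() produces them.\n",
        "from transformers import TextStreamer\n",
        "\n",
        "streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
        "inputs = tokenizer([prompt], return_tensors=\"pt\", return_token_type_ids=False).to(\"cuda\")\n",
        "\n",
        "with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
        "    _ = model.generate(\n",
        "        **inputs,\n",
        "        streamer=streamer,\n",
        "        max_new_tokens=150,\n",
        "        do_sample=True,\n",
        "        temperature=0.92,\n",
        "        pad_token_id=tokenizer.pad_token_id,\n",
        "    )"
      ],
      "metadata": {}, "execution_count": null, "outputs": [] }
  ]
}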