import os
import sys

import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# ========== CONFIG ==========
base_model_name = "meta-llama/Llama-2-7b-hf"  # Replace with a valid model identifier
adapter_path = "./lora_adapter"  # Replace with the actual path to the LoRA adapter
offload_folder = "./offload_dir"  # Directory for offloading weights to disk
hf_token = os.environ.get("HF_TOKEN")  # Read from the environment (e.g. a Space secret); None is fine for public models
# ============================

# Create the offload folder if it doesn't exist
os.makedirs(offload_folder, exist_ok=True)

# Load tokenizer
print("🔄 Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token,
    )
    # Llama tokenizers ship without a pad token; reuse EOS so generation can pad
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    sys.exit(1)

# Initialize an empty (meta-device) model so the full weights never hit RAM here.
# from_config is required inside init_empty_weights; from_pretrained would try to
# materialize real weights within the empty-weights context.
print("🔄 Initializing empty model...")
try:
    config = AutoConfig.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token,
    )
    with init_empty_weights():
        base_model = AutoModelForCausalLM.from_config(
            config,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            trust_remote_code=True,
        )
except Exception as e:
    print(f"Error initializing model: {e}")
    sys.exit(1)

# Load the real weights and dispatch them across GPU / CPU / disk.
# load_checkpoint_and_dispatch needs a local checkpoint path, not a Hub repo id,
# so resolve the repo to a local snapshot first.
print("📦 Loading model checkpoint with disk offload...")
try:
    checkpoint_path = snapshot_download(base_model_name, token=hf_token)
    base_model = load_checkpoint_and_dispatch(
        base_model,
        checkpoint_path,
        device_map="auto",
        no_split_module_classes=["LlamaDecoderLayer"],  # layer class for Llama; adjust for other architectures
        offload_folder=offload_folder,
        offload_state_dict=True,
    )
except Exception as e:
    print(f"Error loading checkpoint: {e}")
    sys.exit(1)

# Load the LoRA adapter on top of the dispatched base model
print("🔌 Loading LoRA adapter...")
try:
    model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        is_trainable=False,  # inference only; the adapter inherits the base model's dtype
    )
except Exception as e:
    print(f"Error loading LoRA adapter: {e}")
    sys.exit(1)

model.eval()
print("✅ Model loaded successfully!")
# Generate function
def generate_text(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),  # bound the new tokens, not prompt + output
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error generating text: {e}"

# Gradio interface (generate_text's signature matches the inputs, so no wrapper is needed)
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your question..."),
        gr.Slider(50, 500, value=200, step=10, label="Max new tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="LoRA + Disk Offload Chatbot",
    description="Runs a LoRA model with disk offload support when RAM is limited.",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
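
# Local usage sketch (assumed dependency set; pin versions to match your environment):
#   pip install torch transformers peft accelerate gradio huggingface_hub
#   HF_TOKEN=<your_token> python app.py
# Then open http://localhost:7860 in a browser.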