import os

import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
# ========== CONFIG ==========
base_model_name = "meta-llama/Llama-2-7b-hf"  # Replace with a valid model identifier
adapter_path = "./lora_adapter"               # Replace with the actual path to your LoRA adapter
offload_folder = "./offload_dir"              # Directory for offloading weights to disk
hf_token = os.environ.get("HF_TOKEN")         # Standard HF token env var; None is fine for public models
# ============================
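
# Disk offload keeps layers that don't fit in GPU/CPU memory in `offload_folder`
# and streams them back in during the forward pass: slower, but avoids OOM crashes.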
# Create the offload folder if it doesn't exist
os.makedirs(offload_folder, exist_ok=True)
# Load tokenizer
print("🔄 Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token,
    )
    # Llama tokenizers ship without a pad token; reuse EOS for padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    exit(1)
# Initialize an empty (meta-device) model to avoid high RAM usage:
# from_config inside init_empty_weights() builds the architecture
# without materializing any weights
print("🔄 Initializing empty model...")
try:
    config = AutoConfig.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token,
    )
    with init_empty_weights():
        base_model = AutoModelForCausalLM.from_config(config)
    base_model.tie_weights()
except Exception as e:
    print(f"Error initializing model: {e}")
    exit(1)

# Load checkpoint with disk offload
print("📦 Loading model checkpoint with disk offload...")
try:
    # load_checkpoint_and_dispatch needs a local checkpoint path, not a Hub id,
    # so download (or reuse the cached copy of) the weights first
    checkpoint_dir = snapshot_download(base_model_name, token=hf_token)
    base_model = load_checkpoint_and_dispatch(
        base_model,
        checkpoint_dir,
        device_map="auto",
        offload_folder=offload_folder,
        offload_state_dict=True,
        no_split_module_classes=["LlamaDecoderLayer"],
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
except Exception as e:
    print(f"Error loading checkpoint: {e}")
    exit(1)
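
# Note: device_map="auto" fills available GPUs first, then CPU RAM, and only
# then spills the remaining layers to the offload folder on disk.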
# Load the LoRA adapter on top of the dispatched base model
print("🔌 Loading LoRA adapter...")
try:
    model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        is_trainable=False,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
except Exception as e:
    print(f"Error loading LoRA adapter: {e}")
    exit(1)
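
# Optionally merge the adapter into the base weights for faster inference
# via peft's merge_and_unload(); this may fail while layers are offloaded to disk:
# model = model.merge_and_unload()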
model.eval()
print("✅ Model loaded successfully!")
# Generate function
def generate_text(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # max_new_tokens counts only generated tokens, so a long prompt
                # can't eat the whole budget the way max_length would allow
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error generating text: {e}"
# Gradio interface
def gradio_interface(prompt, max_new_tokens, temperature, top_p):
    return generate_text(prompt, max_new_tokens, temperature, top_p)

demo = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your question..."),
        gr.Slider(50, 500, value=200, step=10, label="Max new tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="LoRA + Disk Offload Chatbot",
    description="Run a LoRA model with disk offload support when RAM is limited.",
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
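
# Dependencies (version pins are left to your environment):
#   pip install torch transformers peft accelerate gradio huggingface_hub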