import os
import sys

import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download

# ========== CONFIG ==========
base_model_name = "meta-llama/Llama-2-7b-hf"  # Replace with a valid model identifier
adapter_path = "./lora_adapter"               # Replace with the actual path to the LoRA adapter
offload_folder = "./offload_dir"              # Directory for offloading weights to disk
hf_token = "your_huggingface_token"           # Replace with your Hugging Face token (if needed)
# ============================

dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Create the offload folder if it doesn't exist
os.makedirs(offload_folder, exist_ok=True)

# Load tokenizer
print("🔄 Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token if hf_token else None,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    sys.exit(1)

# Build the model skeleton on the meta device so no RAM is spent on weights yet.
# Note: from_pretrained() should not be called inside init_empty_weights() -- it would
# try to materialize the full state dict. Instantiate from the config instead.
print("🔄 Initializing empty model...")
try:
    config = AutoConfig.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        token=hf_token if hf_token else None,
    )
    with init_empty_weights():
        base_model = AutoModelForCausalLM.from_config(
            config,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
except Exception as e:
    print(f"Error initializing model: {e}")
    sys.exit(1)

# Load the checkpoint with disk offload. load_checkpoint_and_dispatch() expects a
# local checkpoint path, so fetch (or reuse the cached) snapshot of the repo first.
print("📦 Loading model checkpoint with disk offload...")
try:
    checkpoint_path = snapshot_download(
        base_model_name,
        token=hf_token if hf_token else None,
    )
    base_model = load_checkpoint_and_dispatch(
        base_model,
        checkpoint_path,
        device_map="auto",
        offload_folder=offload_folder,
        offload_state_dict=True,
        no_split_module_classes=base_model._no_split_modules,  # keep each decoder layer on one device
    )
except Exception as e:
    print(f"Error loading checkpoint: {e}")
    sys.exit(1)

# Load LoRA adapter
print("🔌 Loading LoRA adapter...")
try:
    model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        is_trainable=False,
        torch_dtype=dtype,
    )
except Exception as e:
    print(f"Error loading LoRA adapter: {e}")
    sys.exit(1)

model.eval()
print("✅ Model loaded successfully!")


# Generation function
def generate_text(prompt, max_length=200, temperature=0.7, top_p=0.9):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=int(max_length),  # Gradio sliders return floats
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error generating text: {e}"


# Gradio interface
def gradio_interface(prompt, max_length, temperature, top_p):
    return generate_text(prompt, max_length, temperature, top_p)


demo = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your question..."),
        gr.Slider(50, 500, value=200, step=10, label="Max length"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="LoRA + Disk Offload Chatbot",
    description="Runs a LoRA model with disk offload support when RAM is limited.",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
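# --- Usage sketch (assumption: the app above is running locally on port 7860) ---
# Besides the browser UI at http://localhost:7860, the endpoint can be exercised
# programmatically with gradio_client; the argument order mirrors the inputs list
# above, and "/predict" is the default api_name for a gr.Interface:
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860")
#   answer = client.predict("What is LoRA?", 200, 0.7, 0.9, api_name="/predict")
#   print(answer)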