#!/usr/bin/env python3
# gpu_finetune.py
import os
import sys
import torch
import logging
from pathlib import Path
import traceback

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def check_environment():
    """Check and report system environment"""
    logger.info("=== Environment Check ===")
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        logger.info(f"CUDA version: {torch.version.cuda}")
        logger.info(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")


def main():
    try:
        check_environment()

        logger.info("Importing required packages...")
        try:
            from datasets import load_dataset
            from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
            from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
            from trl import SFTTrainer
            logger.info("✓ All transformers packages imported successfully")
        except ImportError as e:
            logger.error(f"Failed to import transformers packages: {e}")
            logger.error("Please ensure all packages are installed: pip install transformers datasets peft trl")
            sys.exit(1)

        # --- Configuration ---
        MODEL_ID = "google/gemma-3-1b-it"
        OUTPUT_DIR = "./results"
        HUB_MODEL_ID = "omark807/gemma3-finetuned-web-accessibility"
        NUM_TRAIN_EPOCHS = 3
        PER_DEVICE_TRAIN_BATCH_SIZE = 2
        GRADIENT_ACCUMULATION_STEPS = 4
        LEARNING_RATE = 2e-4
        SAVE_STEPS = 500
        LOGGING_STEPS = 10
        MAX_SEQ_LENGTH = 512

        # Create output directory
        Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")

        # --- Device Detection and Quantization Config ---
        if torch.cuda.is_available():
            logger.info("🚀 CUDA is available! Configuring for GPU training.")
            try:
                import bitsandbytes  # only imported to verify the bitsandbytes backend is installed
                from transformers import BitsAndBytesConfig  # BitsAndBytesConfig lives in transformers, not bitsandbytes
                logger.info("✓ BitsAndBytes imported successfully")

                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=False,
                )
                model_dtype = torch.bfloat16
                fp16_arg = False
                bf16_arg = True
                device_map = "auto"
                optimizer_type = "paged_adamw_8bit"
                logger.info("✓ 4-bit quantization configured")
            except ImportError as e:
                logger.warning(f"BitsAndBytes import failed: {e}")
                logger.warning("Falling back to standard GPU configuration without quantization")
                bnb_config = None
                model_dtype = torch.float16  # Use float16 for GPU without quantization
                fp16_arg = True
                bf16_arg = False
                device_map = {"": 0}
                optimizer_type = "adamw_torch"
        else:
            logger.warning("⚠️ CUDA is NOT available. Using CPU configuration.")
            logger.warning("Training will be significantly slower!")
            bnb_config = None
            model_dtype = torch.float32
            fp16_arg = False
            bf16_arg = False
            device_map = "cpu"
            optimizer_type = "adamw_torch"

        # --- LoRA Configuration ---
        lora_config = LoraConfig(
            r=16,
            lora_alpha=16,
            target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
            bias="none",
            lora_dropout=0.05,
            task_type="CAUSAL_LM",
        )
        logger.info("✓ LoRA configuration set")

        # --- Load Dataset ---
        logger.info("Loading dataset...")
        try:
            ds = load_dataset("omark807/web_a11y_dataset")
            logger.info(f"✓ Dataset loaded. Train samples: {len(ds['train'])}")
            sample = ds['train'][0]
            if 'question' not in sample or 'answer' not in sample:
                logger.error("Dataset must have 'question' and 'answer' columns")
                sys.exit(1)
        except Exception as e:
            logger.error(f"Failed to load dataset: {e}")
            logger.error("Check your internet connection and dataset availability")
            sys.exit(1)

        # --- Load Tokenizer ---
        logger.info(f"Loading tokenizer: {MODEL_ID}")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

            # Handle tokenizer padding
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            tokenizer.padding_side = "right"
            tokenizer.model_max_length = MAX_SEQ_LENGTH
            logger.info("✓ Tokenizer loaded and configured")
        except Exception as e:
            logger.error(f"Failed to load tokenizer: {e}")
            sys.exit(1)

        # --- Load Model ---
        logger.info(f"Loading model: {MODEL_ID}")
        try:
            model_kwargs = {
                "torch_dtype": model_dtype,
                "device_map": device_map,
                "trust_remote_code": True,
                "use_cache": False,
            }

            # Add quantization config only if available
            if bnb_config is not None:
                model_kwargs["quantization_config"] = bnb_config

            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

            # Set pretraining_tp for Gemma
            if hasattr(model.config, 'pretraining_tp'):
                model.config.pretraining_tp = 1

            logger.info("✓ Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            logger.error("This might be due to insufficient GPU memory or network issues")
            sys.exit(1)

        # --- Prepare Model for Training ---
        logger.info("Preparing model for training...")
        try:
            # Prepare for k-bit training if using quantization
            if bnb_config is not None:
                model = prepare_model_for_kbit_training(model)
                logger.info("✓ Model prepared for k-bit training")

            # Apply LoRA
            model = get_peft_model(model, lora_config)
            logger.info("✓ LoRA applied to model")

            # Explicitly mark only LoRA parameters as trainable
            for name, param in model.named_parameters():
                if "lora" in name:
                    param.requires_grad = True
                elif param.requires_grad:
                    param.requires_grad = False

            # Unfreeze the LM head, wherever the wrapped model exposes it
            if hasattr(model, 'lm_head'):
                for param in model.lm_head.parameters():
                    param.requires_grad = True
            elif hasattr(model, 'embed_out'):
                for param in model.embed_out.parameters():
                    param.requires_grad = True
            elif hasattr(model, 'base_model') and hasattr(model.base_model, 'lm_head'):
                for param in model.base_model.lm_head.parameters():
                    param.requires_grad = True

            # Keep the embedding matrices frozen
            if hasattr(model, 'get_input_embeddings') and model.get_input_embeddings() is not None:
                model.get_input_embeddings().requires_grad_(False)
            if hasattr(model, 'get_output_embeddings') and model.get_output_embeddings() is not None:
                model.get_output_embeddings().requires_grad_(False)

            model.print_trainable_parameters()  # This will reflect the correct trainable params
            logger.info("✓ Gradient requirements explicitly set for LoRA and LM head")
        except Exception as e:
            logger.error(f"Failed to prepare model: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

        # --- Formatting Function (for pre-tokenization) ---
        def tokenize_function(examples):
            formatted_texts = []
            for i in range(len(examples["question"])):
                question = examples["question"][i]
                answer = examples["answer"][i]
                # Gemma turn format: <start_of_turn>role ... <end_of_turn>
                formatted_text = (
                    f"<start_of_turn>user\n{question}<end_of_turn>\n"
                    f"<start_of_turn>model\n{answer}<end_of_turn>"
                )
                formatted_texts.append(formatted_text)

            # Tokenize the formatted texts directly
            tokenized_inputs = tokenizer(
                formatted_texts,
                max_length=MAX_SEQ_LENGTH,
                truncation=True,
                padding="max_length",
                return_tensors="np",
            )

            # Add 'labels' for language modeling training (padding tokens are not masked out here)
            tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
            return tokenized_inputs
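        # Example of the formatted text produced above for one (hypothetical) sample:
        #   <start_of_turn>user
        #   How should an icon-only button be labelled?<end_of_turn>
        #   <start_of_turn>model
        #   Give it an accessible name, for example via aria-label.<end_of_turn>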
        # --- Pre-tokenize the dataset ---
        logger.info("Pre-tokenizing dataset...")
        try:
            tokenized_ds = ds["train"].map(
                tokenize_function,
                batched=True,
                remove_columns=ds["train"].column_names,
                num_proc=os.cpu_count() or 1,
            )
            logger.info(f"✓ Dataset pre-tokenized. New train samples: {len(tokenized_ds)}")
        except Exception as e:
            logger.error(f"Failed to pre-tokenize dataset: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

        # --- Training Arguments ---
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            optim=optimizer_type,
            learning_rate=LEARNING_RATE,
            fp16=fp16_arg,
            bf16=bf16_arg,
            max_grad_norm=0.3,
            warmup_ratio=0.03,
            lr_scheduler_type="constant",
            logging_steps=LOGGING_STEPS,
            save_steps=SAVE_STEPS,
            save_total_limit=3,
            remove_unused_columns=False,
            push_to_hub=False,
            hub_model_id=HUB_MODEL_ID,
            report_to="tensorboard",
            dataloader_num_workers=0,
            save_safetensors=True,
            gradient_checkpointing=False,
        )
        logger.info("✓ Training arguments configured")

        # --- Initialize Trainer ---
        logger.info("Initializing SFTTrainer...")
        try:
            trainer = SFTTrainer(
                model=model,
                train_dataset=tokenized_ds,
                args=training_args,
            )
            logger.info("✓ SFTTrainer initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize trainer: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")  # Added traceback for debugging
            sys.exit(1)

        # --- Start Training ---
        logger.info("🚀 Starting fine-tuning...")
        logger.info(f"Training for {NUM_TRAIN_EPOCHS} epochs")
        logger.info(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}, Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
        logger.info(f"Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

        try:
            trainer.train()
            logger.info("🎉 Fine-tuning completed successfully!")
        except Exception as e:
            logger.error(f"Training failed: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

        # --- Save Model ---
        logger.info("Saving model and tokenizer...")
        try:
            trainer.save_model(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
            logger.info(f"✓ Model saved to: {os.path.abspath(OUTPUT_DIR)}")

            # Save training info
            with open(os.path.join(OUTPUT_DIR, "training_info.txt"), "w") as f:
                f.write(f"Model: {MODEL_ID}\n")
                f.write(f"Epochs: {NUM_TRAIN_EPOCHS}\n")
                f.write(f"Learning rate: {LEARNING_RATE}\n")
                f.write(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}\n")
                f.write(f"LoRA r: {lora_config.r}\n")
                f.write(f"Device: {'GPU' if torch.cuda.is_available() else 'CPU'}\n")
                f.write(f"Quantization: {bnb_config is not None}\n")

            logger.info("✅ All done! Model ready for use.")
        except Exception as e:
            logger.error(f"Failed to save model: {e}")
            sys.exit(1)

    except KeyboardInterrupt:
        logger.info("Training interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        logger.error(f"Full traceback: {traceback.format_exc()}")
        sys.exit(1)


if __name__ == "__main__":
    main()
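

# --- Optional usage sketch (illustrative only; never called by this script) ---
# A minimal example of how the LoRA adapter saved above could be loaded for
# inference, assuming the adapter and tokenizer were written to ./results.
# The sample question and the function name are hypothetical; the prompt
# mirrors the Gemma turn format used during training.
def example_inference(adapter_dir="./results",
                      question="How do I make a custom dropdown keyboard accessible?"):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    # Load the base model, then attach the saved LoRA adapter weights on top of it
    base = AutoModelForCausalLM.from_pretrained(
        "google/gemma-3-1b-it",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else "cpu",
    )
    model = PeftModel.from_pretrained(base, adapter_dir)
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

    # Build a single-turn prompt and generate a response
    prompt = f"<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)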