Ubuntu committed
Commit b507cf2 · 1 Parent(s): ace32a8

GRPO working

Files changed (1)
  1. GRPO.py +47 -14
GRPO.py CHANGED
@@ -1,6 +1,7 @@
 from datasets import load_dataset, Dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from trl import GRPOConfig, GRPOTrainer
+from peft import LoraConfig, get_peft_model
 import torch
 import os
 from collections import defaultdict
@@ -62,6 +63,10 @@ for tree_id, msgs in conversations.items():
 # Convert to Hugging Face dataset format for preference learning
 preference_dataset = Dataset.from_list(pairs)

+# Limit dataset size to speed up training (use first 1000 examples)
+if len(preference_dataset) > 1000:
+    preference_dataset = preference_dataset.select(range(1000))
+
 print(f"Created {len(preference_dataset)} preference pairs for GRPO")

 # Debug: Print a sample pair if available
@@ -73,30 +78,51 @@ if len(preference_dataset) > 0:
 else:
     print("WARNING: No preference pairs were created. Check the dataset structure.")

-# Load model and tokenizer
+# Configure quantization for loading the model
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
+
+# Load model and tokenizer with quantization
 model_name = "microsoft/phi-2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
     device_map="auto"
 )

-# Define a reward function that rewards helpful, concise responses
-# and penalizes responses similar to rejected ones
-def reward_func(completions, **kwargs):
-    return [len(c.split()) for c in completions]  # reward by word count
+# Configure LoRA
+peft_config = LoraConfig(
+    r=16,  # Rank
+    lora_alpha=32,  # Alpha parameter for LoRA scaling
+    lora_dropout=0.05,  # Dropout probability for LoRA layers
+    bias="none",  # Bias type for LoRA
+    task_type="CAUSAL_LM",  # Task type
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+)
+
+# Apply LoRA to the model
+model = get_peft_model(model, peft_config)
+model.print_trainable_parameters()  # Print trainable parameters info

 # Configure tokenizer for chat format
 tokenizer.pad_token = tokenizer.eos_token
 tokenizer.padding_side = "left"

+# Define a reward function that rewards helpful, concise responses
+def reward_func(completions, **kwargs):
+    return [len(c.split()) for c in completions]  # reward by word count
+
 # Configure GRPO training
 training_args = GRPOConfig(
-    output_dir="phi2-grpo-openassistant",
-    num_train_epochs=3,
+    output_dir="phi2-grpo-qlora",
+    num_train_epochs=1,  # Reduced from 3 to 1
     per_device_train_batch_size=2,
-    gradient_accumulation_steps=16,
+    gradient_accumulation_steps=4,  # Reduced from 16 to 4
     gradient_checkpointing=True,
     learning_rate=5e-6,
     logging_steps=10,
@@ -107,11 +133,18 @@ training_args = GRPOConfig(
     optim="adamw_torch",
     lr_scheduler_type="cosine",
     warmup_ratio=0.1,
-    num_generations=2,  # Set the desired number of generations per prompt
+    num_generations=2,
+    max_length=256,  # Added to limit generation length
+    generation_kwargs={  # Added to control generation
+        "max_new_tokens": 128,
+        "pad_token_id": tokenizer.eos_token_id,
+        "do_sample": False,
+    },
+    logging_first_step=True,  # Log first step metrics
+    logging_nan_inf_filter=False,  # Show all warnings
 )

-
-# Initialize the GRPO trainer with preference dataset
+# Initialize the GRPO trainer
 trainer = GRPOTrainer(
     model=model,
     args=training_args,
@@ -126,4 +159,4 @@ trainer.tokenizer = tokenizer
 trainer.train()

 # Save the final model
-trainer.save_model("phi2-grpo-openassistant-final")
+trainer.save_model("phi2-grpo-qlora-final")
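
The committed reward_func scores each completion by its raw word count, which favors longer outputs even though the comment introducing it asks for concise responses. Below is a hedged sketch, not part of this commit, of a length-targeted alternative; TARGET_WORDS and length_targeted_reward are illustrative names that do not appear in GRPO.py.

# Illustrative sketch only -- not committed code. The reward peaks when a
# completion is near TARGET_WORDS and decays linearly as its length deviates.
TARGET_WORDS = 80  # assumed target length, chosen arbitrarily for the example

def length_targeted_reward(completions, **kwargs):
    rewards = []
    for completion in completions:
        n_words = len(completion.split())
        rewards.append(1.0 - abs(n_words - TARGET_WORDS) / TARGET_WORDS)
    return rewards

Like the committed function, it returns one float per completion, so it has the same shape GRPOTrainer expects from a reward function.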
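
Because the model is wrapped with get_peft_model before training, trainer.save_model("phi2-grpo-qlora-final") should write the LoRA adapter rather than a full copy of phi-2. A minimal inference sketch under that assumption, loading the tokenizer from the base microsoft/phi-2 checkpoint in case it was not saved alongside the adapter; the prompt string is only an example.

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_dir = "phi2-grpo-qlora-final"  # directory written by trainer.save_model() above
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# AutoPeftModelForCausalLM reads the adapter config, loads the base model it
# points to, and applies the adapter weights on top.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()

prompt = "Explain GRPO in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))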