Ubuntu
committed on
Commit
·
ad4670f
1
Parent(s):
b507cf2
added checkpointing
Browse files
GRPO.py
CHANGED
@@ -120,13 +120,14 @@ def reward_func(completions, **kwargs):
|
|
120 |
# Configure GRPO training
|
121 |
training_args = GRPOConfig(
|
122 |
output_dir="phi2-grpo-qlora",
|
123 |
-
num_train_epochs=1,
|
124 |
per_device_train_batch_size=2,
|
125 |
-
gradient_accumulation_steps=4,
|
126 |
gradient_checkpointing=True,
|
127 |
learning_rate=5e-6,
|
128 |
logging_steps=10,
|
129 |
-
save_steps=
|
|
|
130 |
fp16=True,
|
131 |
remove_unused_columns=False,
|
132 |
report_to="none",
|
@@ -134,14 +135,6 @@ training_args = GRPOConfig(
|
|
134 |
lr_scheduler_type="cosine",
|
135 |
warmup_ratio=0.1,
|
136 |
num_generations=2,
|
137 |
-
max_length=256, # Added to limit generation length
|
138 |
-
generation_kwargs={ # Added to control generation
|
139 |
-
"max_new_tokens": 128,
|
140 |
-
"pad_token_id": tokenizer.eos_token_id,
|
141 |
-
"do_sample": False,
|
142 |
-
},
|
143 |
-
logging_first_step=True, # Log first step metrics
|
144 |
-
logging_nan_inf_filter=False, # Show all warnings
|
145 |
)
|
146 |
|
147 |
# Initialize the GRPO trainer
|
@@ -156,7 +149,7 @@ trainer = GRPOTrainer(
|
|
156 |
trainer.tokenizer = tokenizer
|
157 |
|
158 |
# Start training
|
159 |
-
trainer.train()
|
160 |
|
161 |
# Save the final model
|
162 |
trainer.save_model("phi2-grpo-qlora-final")
|
|
|
120 |
# Configure GRPO training
|
121 |
training_args = GRPOConfig(
|
122 |
output_dir="phi2-grpo-qlora",
|
123 |
+
num_train_epochs=1,
|
124 |
per_device_train_batch_size=2,
|
125 |
+
gradient_accumulation_steps=4,
|
126 |
gradient_checkpointing=True,
|
127 |
learning_rate=5e-6,
|
128 |
logging_steps=10,
|
129 |
+
save_steps=10, # Save every 10 steps
|
130 |
+
save_total_limit=1, # Keep only 1 checkpoint (overwrite previous ones)
|
131 |
fp16=True,
|
132 |
remove_unused_columns=False,
|
133 |
report_to="none",
|
|
|
135 |
lr_scheduler_type="cosine",
|
136 |
warmup_ratio=0.1,
|
137 |
num_generations=2,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
)
|
139 |
|
140 |
# Initialize the GRPO trainer
|
|
|
149 |
trainer.tokenizer = tokenizer
|
150 |
|
151 |
# Start training
|
152 |
+
trainer.train(resume_from_checkpoint=True) # Resume from the latest checkpoint
|
153 |
|
154 |
# Save the final model
|
155 |
trainer.save_model("phi2-grpo-qlora-final")
|