Ubuntu committed on
Commit
ad4670f
·
1 Parent(s): b507cf2

added checkpointing

Browse files
Files changed (1) hide show
  1. GRPO.py +5 -12
GRPO.py CHANGED
@@ -120,13 +120,14 @@ def reward_func(completions, **kwargs):
120
  # Configure GRPO training
121
  training_args = GRPOConfig(
122
  output_dir="phi2-grpo-qlora",
123
- num_train_epochs=1, # Reduced from 3 to 1
124
  per_device_train_batch_size=2,
125
- gradient_accumulation_steps=4, # Reduced from 16 to 4
126
  gradient_checkpointing=True,
127
  learning_rate=5e-6,
128
  logging_steps=10,
129
- save_steps=100,
 
130
  fp16=True,
131
  remove_unused_columns=False,
132
  report_to="none",
@@ -134,14 +135,6 @@ training_args = GRPOConfig(
134
  lr_scheduler_type="cosine",
135
  warmup_ratio=0.1,
136
  num_generations=2,
137
- max_length=256, # Added to limit generation length
138
- generation_kwargs={ # Added to control generation
139
- "max_new_tokens": 128,
140
- "pad_token_id": tokenizer.eos_token_id,
141
- "do_sample": False,
142
- },
143
- logging_first_step=True, # Log first step metrics
144
- logging_nan_inf_filter=False, # Show all warnings
145
  )
146
 
147
  # Initialize the GRPO trainer
@@ -156,7 +149,7 @@ trainer = GRPOTrainer(
156
  trainer.tokenizer = tokenizer
157
 
158
  # Start training
159
- trainer.train()
160
 
161
  # Save the final model
162
  trainer.save_model("phi2-grpo-qlora-final")
 
120
  # Configure GRPO training
121
  training_args = GRPOConfig(
122
  output_dir="phi2-grpo-qlora",
123
+ num_train_epochs=1,
124
  per_device_train_batch_size=2,
125
+ gradient_accumulation_steps=4,
126
  gradient_checkpointing=True,
127
  learning_rate=5e-6,
128
  logging_steps=10,
129
+ save_steps=10, # Save every 10 steps
130
+ save_total_limit=1, # Keep only 1 checkpoint (overwrite previous ones)
131
  fp16=True,
132
  remove_unused_columns=False,
133
  report_to="none",
 
135
  lr_scheduler_type="cosine",
136
  warmup_ratio=0.1,
137
  num_generations=2,
 
 
 
 
 
 
 
 
138
  )
139
 
140
  # Initialize the GRPO trainer
 
149
  trainer.tokenizer = tokenizer
150
 
151
  # Start training
152
+ trainer.train(resume_from_checkpoint=True) # Resume from the latest checkpoint
153
 
154
  # Save the final model
155
  trainer.save_model("phi2-grpo-qlora-final")