TODO: check QUESTION_TEMPLATE before running this configuration.
kl: 0.01
lr: 1e-6
epoch: 1
trained with cold-start data included
noise = torch.randn_like(advantages) * 0.02
advantages = advantages + noise
def format_reward(completions, **kwargs):
    """Score each completion for matching the <think>/<answer> format.

    Each completion can earn up to 1.0:
      * 0.5 if the content matches the expected tag layout AND passes
        ``recheck_format``;
      * up to 0.5 more, scaled linearly with the length of the first
        think block, capped at 1200 characters.

    Args:
        completions: sequence of message lists; only the first message's
            ``"content"`` field is scored.
        **kwargs: ignored (kept for reward-function signature compatibility).

    Returns:
        list[float]: one reward per completion, each in [0.0, 1.0].
    """
    # NOTE(review): the original pattern r".*?\s*.*?" matched EVERY string
    # under fullmatch, so the 0.5 bonus depended solely on recheck_format.
    # The tag literals were almost certainly stripped during extraction;
    # restored the structure implied by extract_first_think_answer —
    # confirm against QUESTION_TEMPLATE.
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content in contents:
        score = 0.0
        if re.fullmatch(pattern, content, re.DOTALL) and recheck_format(content):
            score += 0.5
        # Length bonus: encourage substantive reasoning up to 1200 chars.
        # NOTE(review): source indentation was lost; assumed this bonus is
        # granted unconditionally (not only when the format check passes) —
        # verify against the original training script.
        think = extract_first_think_answer(content)
        score += min(len(think) / 1200, 1) * 0.5
        rewards.append(score)
    return rewards