|
Check QUESTION_TEMPLATE! |
|
|
|
kl: 0.01 |
|
lr: 1e-6 |
|
epoch: 1 |
|
with cold-start data |
|
|
|
# Perturb the advantages with Gaussian noise scaled by 0.02; randn_like draws
# standard-normal samples matching the input tensor's size.
noise = 0.02 * torch.randn_like(advantages)
advantages = advantages + noise
|
|
|
def format_reward(completions, **kwargs):
    """Score completions on the ``<think>…</think><answer>…</answer>`` layout.

    A completion earns 0.5 when its whole text matches the think/answer
    pattern and also passes ``recheck_format``. It then earns up to another
    0.5 that grows linearly with the length of the value returned by
    ``extract_first_think_answer`` and saturates at 1200. Anything else
    scores 0.0.

    Args:
        completions: Sequence of completions; the text of each one is read
            from ``completion[0]["content"]``.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        A list of floats, one per completion, in input order.
    """
    # NOTE(review): indentation was lost in the source this was recovered
    # from; the length bonus is assumed to apply only to well-formed
    # completions -- confirm against the original script.
    tag_layout = r"<think>.*?</think>\s*<answer>.*?</answer>"
    texts = [item[0]["content"] for item in completions]

    scores = []
    for text in texts:
        # Short-circuit: recheck_format only runs once the regex has matched.
        well_formed = re.fullmatch(tag_layout, text, re.DOTALL) and recheck_format(text)
        if not well_formed:
            scores.append(0.0)
            continue

        # Presumably the reasoning text inside <think>; verify against the helper.
        reasoning = extract_first_think_answer(text)
        length_bonus = min(len(reasoning) / 1200, 1) * 0.5
        scores.append(0.5 + length_bonus)

    return scores