# Training notes (kept verbatim from the original free-text header):
#   Check QUESTION_TEMPLATE!
#   kl: 0.01  lr: 1e-6  epoch: 1  with cold-start data
#
# Advantage-noise snippet recorded with the notes above. It references
# `advantages`, which is not defined at this scope, so it is kept as a
# comment rather than live module-level code:
#   noise = torch.randn_like(advantages) * 0.02
#   advantages = advantages + noise


def format_reward(completions, **kwargs):
    """Score each completion on output format plus reasoning length.

    A completion whose text matches the expected tag layout (and also passes
    ``recheck_format``) earns a base reward of 0.5, plus up to another 0.5
    that grows linearly with ``len()`` of the value returned by
    ``extract_first_think_answer`` and saturates at 1200. Any other
    completion scores 0.0.

    Args:
        completions: Sequence of completions; the text of each one is read
            from ``completion[0]["content"]``.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        A list of float rewards in ``[0.0, 1.0]``, one per completion, in
        input order.
    """
    # NOTE(review): the original pattern was r".*?\s*.*?", which under
    # fullmatch + DOTALL matches every string and made this check a no-op.
    # The tag literals appear to have been stripped; they are assumed to be
    # <think>/<answer> -- confirm against QUESTION_TEMPLATE.
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    base_reward = 0.5
    max_length_bonus = 0.5
    saturation_length = 1200

    completion_contents = [completion[0]["content"] for completion in completions]

    rewards = []
    for content in completion_contents:
        score = 0.0
        if re.fullmatch(pattern, content, re.DOTALL) and recheck_format(content):
            score += base_reward
            # NOTE(review): the original indentation was lost; the length
            # bonus is assumed to apply only to well-formatted completions,
            # so the extractor always receives validated content.
            think = extract_first_think_answer(content)
            score += min(len(think) / saturation_length, 1) * max_length_bonus
        rewards.append(score)
    return rewards