{ "results": { "gsm8k": { "exact_match,get-answer": 0.7414708112206216, "exact_match_stderr,get-answer": 0.012059911372516116, "alias": "gsm8k" } }, "configs": { "gsm8k": { "task": "gsm8k", "group": [ "math_word_problems" ], "dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/gsm8k", "dataset_name": "main", "training_split": "train", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{answer}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 5, "metric_list": [ { "metric": "exact_match", "aggregation": "mean", "higher_is_better": true, "ignore_case": true, "ignore_punctuation": false, "regexes_to_ignore": [ ",", "\\$", "(?s).*#### " ] } ], "output_type": "generate_until", "generation_kwargs": { "until": [ "\n\n", "Question:" ], "do_sample": false, "temperature": 0.0 }, "repeats": 1, "filter_list": [ { "name": "get-answer", "filter": [ { "function": "regex", "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 2.0 } } }, "versions": { "gsm8k": 2.0 }, "n-shot": { "gsm8k": 5 }, "config": { "model": "vllm", "model_args": "pretrained=/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/Oasis,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.9,data_parallel_size=1,max_model_len=4096", "batch_size": "auto:128", "batch_sizes": [], "device": "cuda", "use_cache": "/lustre07/scratch/gagan30/arocr/cache/", "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": null }