{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 500,
  "global_step": 156,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0064,
      "grad_norm": 4.46052360534668,
      "learning_rate": 0.0,
      "loss": 1.1926,
      "mean_token_accuracy": 0.7001730054616928,
      "num_tokens": 14640.0,
      "step": 1
    },
    {
      "epoch": 0.0128,
      "grad_norm": 3.8626222610473633,
      "learning_rate": 6.25e-07,
      "loss": 1.3138,
      "mean_token_accuracy": 0.6548218131065369,
      "num_tokens": 27693.0,
      "step": 2
    },
    {
      "epoch": 0.0192,
      "grad_norm": 3.952249050140381,
      "learning_rate": 1.25e-06,
      "loss": 1.3569,
      "mean_token_accuracy": 0.6562561392784119,
      "num_tokens": 40525.0,
      "step": 3
    },
    {
      "epoch": 0.0256,
      "grad_norm": 4.109567165374756,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.3472,
      "mean_token_accuracy": 0.656329445540905,
      "num_tokens": 55822.0,
      "step": 4
    },
    {
      "epoch": 0.032,
      "grad_norm": 4.119779586791992,
      "learning_rate": 2.5e-06,
      "loss": 1.3473,
      "mean_token_accuracy": 0.6558413058519363,
      "num_tokens": 70670.0,
      "step": 5
    },
    {
      "epoch": 0.0384,
      "grad_norm": 3.3819665908813477,
      "learning_rate": 3.125e-06,
      "loss": 1.4108,
      "mean_token_accuracy": 0.6545125097036362,
      "num_tokens": 84335.0,
      "step": 6
    },
    {
      "epoch": 0.0448,
      "grad_norm": 3.7010326385498047,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.2498,
      "mean_token_accuracy": 0.6681988388299942,
      "num_tokens": 95857.0,
      "step": 7
    },
    {
      "epoch": 0.0512,
      "grad_norm": 3.496901035308838,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.2894,
      "mean_token_accuracy": 0.6635143309831619,
      "num_tokens": 109806.0,
      "step": 8
    },
    {
      "epoch": 0.0576,
      "grad_norm": 3.2937991619110107,
      "learning_rate": 5e-06,
      "loss": 1.2092,
      "mean_token_accuracy": 0.6767952516674995,
      "num_tokens": 125631.0,
      "step": 9
    },
    {
      "epoch": 0.064,
      "grad_norm": 6.788699626922607,
      "learning_rate": 5e-06,
      "loss": 1.4017,
      "mean_token_accuracy": 0.6475312933325768,
      "num_tokens": 140029.0,
      "step": 10
    },
    {
      "epoch": 0.0704,
      "grad_norm": 8.16016674041748,
      "learning_rate": 5e-06,
      "loss": 1.4182,
      "mean_token_accuracy": 0.6417016983032227,
      "num_tokens": 153863.0,
      "step": 11
    },
    {
      "epoch": 0.0768,
      "grad_norm": 7.313711643218994,
      "learning_rate": 5e-06,
      "loss": 1.2493,
      "mean_token_accuracy": 0.6722374185919762,
      "num_tokens": 169366.0,
      "step": 12
    },
    {
      "epoch": 0.0832,
      "grad_norm": 7.297321319580078,
      "learning_rate": 5e-06,
      "loss": 1.2984,
      "mean_token_accuracy": 0.6654682904481888,
      "num_tokens": 182285.0,
      "step": 13
    },
    {
      "epoch": 0.0896,
      "grad_norm": 5.980355262756348,
      "learning_rate": 5e-06,
      "loss": 1.3728,
      "mean_token_accuracy": 0.6474195793271065,
      "num_tokens": 200009.0,
      "step": 14
    },
    {
      "epoch": 0.096,
      "grad_norm": 5.341892242431641,
      "learning_rate": 5e-06,
      "loss": 1.1786,
      "mean_token_accuracy": 0.6898260414600372,
      "num_tokens": 217172.0,
      "step": 15
    },
    {
      "epoch": 0.1024,
      "grad_norm": 5.044528961181641,
      "learning_rate": 5e-06,
      "loss": 1.473,
      "mean_token_accuracy": 0.6259094700217247,
      "num_tokens": 232308.0,
      "step": 16
    },
    {
      "epoch": 0.1088,
      "grad_norm": 4.829463005065918,
      "learning_rate": 5e-06,
      "loss": 1.4725,
      "mean_token_accuracy": 0.6119906902313232,
      "num_tokens": 243772.0,
      "step": 17
    },
    {
      "epoch": 0.1152,
      "grad_norm": 3.969775676727295,
      "learning_rate": 5e-06,
      "loss": 1.2455,
      "mean_token_accuracy": 0.6761033684015274,
      "num_tokens": 260674.0,
      "step": 18
    },
    {
      "epoch": 0.1216,
      "grad_norm": 4.059275150299072,
      "learning_rate": 5e-06,
      "loss": 1.3388,
      "mean_token_accuracy": 0.6495725736021996,
      "num_tokens": 276352.0,
      "step": 19
    },
    {
      "epoch": 0.128,
      "grad_norm": 3.6464030742645264,
      "learning_rate": 5e-06,
      "loss": 1.277,
      "mean_token_accuracy": 0.6567010655999184,
      "num_tokens": 291765.0,
      "step": 20
    },
    {
      "epoch": 0.1344,
      "grad_norm": 4.572549819946289,
      "learning_rate": 5e-06,
      "loss": 1.349,
      "mean_token_accuracy": 0.6503036767244339,
      "num_tokens": 303401.0,
      "step": 21
    },
    {
      "epoch": 0.1408,
      "grad_norm": 3.3478147983551025,
      "learning_rate": 5e-06,
      "loss": 1.3734,
      "mean_token_accuracy": 0.657492570579052,
      "num_tokens": 318861.0,
      "step": 22
    },
    {
      "epoch": 0.1472,
      "grad_norm": 3.8337855339050293,
      "learning_rate": 5e-06,
      "loss": 1.2852,
      "mean_token_accuracy": 0.6548745334148407,
      "num_tokens": 330566.0,
      "step": 23
    },
    {
      "epoch": 0.1536,
      "grad_norm": 4.150561809539795,
      "learning_rate": 5e-06,
      "loss": 1.2668,
      "mean_token_accuracy": 0.6780209168791771,
      "num_tokens": 341999.0,
      "step": 24
    },
    {
      "epoch": 0.16,
      "grad_norm": 5.090829849243164,
      "learning_rate": 5e-06,
      "loss": 1.3751,
      "mean_token_accuracy": 0.6607428789138794,
      "num_tokens": 358011.0,
      "step": 25
    },
    {
      "epoch": 0.1664,
      "grad_norm": 4.033487319946289,
      "learning_rate": 5e-06,
      "loss": 1.2884,
      "mean_token_accuracy": 0.6732476428151131,
      "num_tokens": 371643.0,
      "step": 26
    },
    {
      "epoch": 0.1728,
      "grad_norm": 3.5941057205200195,
      "learning_rate": 5e-06,
      "loss": 1.047,
      "mean_token_accuracy": 0.7033858001232147,
      "num_tokens": 383610.0,
      "step": 27
    },
    {
      "epoch": 0.1792,
      "grad_norm": 3.3021464347839355,
      "learning_rate": 5e-06,
      "loss": 1.233,
      "mean_token_accuracy": 0.6834521070122719,
      "num_tokens": 398590.0,
      "step": 28
    },
    {
      "epoch": 0.1856,
      "grad_norm": 4.259124279022217,
      "learning_rate": 5e-06,
      "loss": 1.2961,
      "mean_token_accuracy": 0.6622931659221649,
      "num_tokens": 409575.0,
      "step": 29
    },
    {
      "epoch": 0.192,
      "grad_norm": 3.220076322555542,
      "learning_rate": 5e-06,
      "loss": 1.0836,
      "mean_token_accuracy": 0.7075582221150398,
      "num_tokens": 423461.0,
      "step": 30
    },
    {
      "epoch": 0.1984,
      "grad_norm": 3.3759732246398926,
      "learning_rate": 5e-06,
      "loss": 1.2671,
      "mean_token_accuracy": 0.6733162626624107,
      "num_tokens": 439184.0,
      "step": 31
    },
    {
      "epoch": 0.2048,
      "grad_norm": 13.060998916625977,
      "learning_rate": 5e-06,
      "loss": 1.263,
      "mean_token_accuracy": 0.6846785917878151,
      "num_tokens": 451887.0,
      "step": 32
    },
    {
      "epoch": 0.2112,
      "grad_norm": 3.4759275913238525,
      "learning_rate": 5e-06,
      "loss": 1.2905,
      "mean_token_accuracy": 0.6659640669822693,
      "num_tokens": 466293.0,
      "step": 33
    },
    {
      "epoch": 0.2176,
      "grad_norm": 3.615352153778076,
      "learning_rate": 5e-06,
      "loss": 1.3252,
      "mean_token_accuracy": 0.6587846502661705,
      "num_tokens": 482281.0,
      "step": 34
    },
    {
      "epoch": 0.224,
      "grad_norm": 3.8253462314605713,
      "learning_rate": 5e-06,
      "loss": 1.284,
      "mean_token_accuracy": 0.6593102514743805,
      "num_tokens": 494866.0,
      "step": 35
    },
    {
      "epoch": 0.2304,
      "grad_norm": 3.8242146968841553,
      "learning_rate": 5e-06,
      "loss": 1.4445,
      "mean_token_accuracy": 0.6356016099452972,
      "num_tokens": 509737.0,
      "step": 36
    },
    {
      "epoch": 0.2368,
      "grad_norm": 3.3142733573913574,
      "learning_rate": 5e-06,
      "loss": 1.3622,
      "mean_token_accuracy": 0.6442921012639999,
      "num_tokens": 524987.0,
      "step": 37
    },
    {
      "epoch": 0.2432,
      "grad_norm": 3.3738458156585693,
      "learning_rate": 5e-06,
      "loss": 1.5074,
      "mean_token_accuracy": 0.611025758087635,
      "num_tokens": 540515.0,
      "step": 38
    },
    {
      "epoch": 0.2496,
      "grad_norm": 3.5873780250549316,
      "learning_rate": 5e-06,
      "loss": 1.2594,
      "mean_token_accuracy": 0.672322042286396,
      "num_tokens": 554108.0,
      "step": 39
    },
    {
      "epoch": 0.256,
      "grad_norm": 3.3734376430511475,
      "learning_rate": 5e-06,
      "loss": 1.386,
      "mean_token_accuracy": 0.6393688693642616,
      "num_tokens": 570067.0,
      "step": 40
    },
    {
      "epoch": 0.2624,
      "grad_norm": 3.8531699180603027,
      "learning_rate": 5e-06,
      "loss": 1.2987,
      "mean_token_accuracy": 0.6596419736742973,
      "num_tokens": 583435.0,
      "step": 41
    },
    {
      "epoch": 0.2688,
      "grad_norm": 3.7436106204986572,
      "learning_rate": 5e-06,
      "loss": 1.3517,
      "mean_token_accuracy": 0.6706470102071762,
      "num_tokens": 596105.0,
      "step": 42
    },
    {
      "epoch": 0.2752,
      "grad_norm": 3.717897653579712,
      "learning_rate": 5e-06,
      "loss": 1.147,
      "mean_token_accuracy": 0.7020372673869133,
      "num_tokens": 610310.0,
      "step": 43
    },
    {
      "epoch": 0.2816,
      "grad_norm": 3.6992413997650146,
      "learning_rate": 5e-06,
      "loss": 1.19,
      "mean_token_accuracy": 0.6930102109909058,
      "num_tokens": 624612.0,
      "step": 44
    },
    {
      "epoch": 0.288,
      "grad_norm": 3.5740365982055664,
      "learning_rate": 5e-06,
      "loss": 1.368,
      "mean_token_accuracy": 0.6361823007464409,
      "num_tokens": 637714.0,
      "step": 45
    },
    {
      "epoch": 0.2944,
      "grad_norm": 3.959993600845337,
      "learning_rate": 5e-06,
      "loss": 1.2512,
      "mean_token_accuracy": 0.6959773004055023,
      "num_tokens": 650333.0,
      "step": 46
    },
    {
      "epoch": 0.3008,
      "grad_norm": 3.906562089920044,
      "learning_rate": 5e-06,
      "loss": 1.2867,
      "mean_token_accuracy": 0.6637914702296257,
      "num_tokens": 664887.0,
      "step": 47
    },
    {
      "epoch": 0.3072,
      "grad_norm": 3.9186768531799316,
      "learning_rate": 5e-06,
      "loss": 1.3578,
      "mean_token_accuracy": 0.6380120068788528,
      "num_tokens": 676568.0,
      "step": 48
    },
    {
      "epoch": 0.3136,
      "grad_norm": 3.360166549682617,
      "learning_rate": 5e-06,
      "loss": 1.0643,
      "mean_token_accuracy": 0.7167638763785362,
      "num_tokens": 690799.0,
      "step": 49
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.504518508911133,
      "learning_rate": 5e-06,
      "loss": 1.2447,
      "mean_token_accuracy": 0.6620298400521278,
      "num_tokens": 706489.0,
      "step": 50
    },
    {
      "epoch": 0.3264,
      "grad_norm": 3.65311861038208,
      "learning_rate": 5e-06,
      "loss": 1.4979,
      "mean_token_accuracy": 0.6288957595825195,
      "num_tokens": 721499.0,
      "step": 51
    },
    {
      "epoch": 0.3328,
      "grad_norm": 3.348665475845337,
      "learning_rate": 5e-06,
      "loss": 1.3043,
      "mean_token_accuracy": 0.6755396276712418,
      "num_tokens": 738238.0,
      "step": 52
    },
    {
      "epoch": 0.3392,
      "grad_norm": 3.7475781440734863,
      "learning_rate": 5e-06,
      "loss": 1.3655,
      "mean_token_accuracy": 0.6580836474895477,
      "num_tokens": 750933.0,
      "step": 53
    },
    {
      "epoch": 0.3456,
      "grad_norm": 3.65537691116333,
      "learning_rate": 5e-06,
      "loss": 1.428,
      "mean_token_accuracy": 0.6435032188892365,
      "num_tokens": 766190.0,
      "step": 54
    },
    {
      "epoch": 0.352,
      "grad_norm": 3.4737765789031982,
      "learning_rate": 5e-06,
      "loss": 1.3459,
      "mean_token_accuracy": 0.6618407666683197,
      "num_tokens": 781058.0,
      "step": 55
    },
    {
      "epoch": 0.3584,
      "grad_norm": 3.7944328784942627,
      "learning_rate": 5e-06,
      "loss": 1.4812,
      "mean_token_accuracy": 0.6265006363391876,
      "num_tokens": 793591.0,
      "step": 56
    },
    {
      "epoch": 0.3648,
      "grad_norm": 3.4754698276519775,
      "learning_rate": 5e-06,
      "loss": 1.1157,
      "mean_token_accuracy": 0.6880814656615257,
      "num_tokens": 807562.0,
      "step": 57
    },
    {
      "epoch": 0.3712,
      "grad_norm": 3.345503568649292,
      "learning_rate": 5e-06,
      "loss": 1.2552,
      "mean_token_accuracy": 0.6683396399021149,
      "num_tokens": 822029.0,
      "step": 58
    },
    {
      "epoch": 0.3776,
      "grad_norm": 3.702852487564087,
      "learning_rate": 5e-06,
      "loss": 1.2624,
      "mean_token_accuracy": 0.6913647577166557,
      "num_tokens": 833999.0,
      "step": 59
    },
    {
      "epoch": 0.384,
      "grad_norm": 3.169661045074463,
      "learning_rate": 5e-06,
      "loss": 1.3735,
      "mean_token_accuracy": 0.6340024396777153,
      "num_tokens": 848952.0,
      "step": 60
    },
    {
      "epoch": 0.3904,
      "grad_norm": 3.8648996353149414,
      "learning_rate": 5e-06,
      "loss": 1.2589,
      "mean_token_accuracy": 0.6712265312671661,
      "num_tokens": 865078.0,
      "step": 61
    },
    {
      "epoch": 0.3968,
      "grad_norm": 3.6297190189361572,
      "learning_rate": 5e-06,
      "loss": 1.228,
      "mean_token_accuracy": 0.6683308631181717,
      "num_tokens": 877815.0,
      "step": 62
    },
    {
      "epoch": 0.4032,
      "grad_norm": 3.3031067848205566,
      "learning_rate": 5e-06,
      "loss": 1.3284,
      "mean_token_accuracy": 0.6419568583369255,
      "num_tokens": 891433.0,
      "step": 63
    },
    {
      "epoch": 0.4096,
      "grad_norm": 3.6126585006713867,
      "learning_rate": 5e-06,
      "loss": 1.327,
      "mean_token_accuracy": 0.6622776314616203,
      "num_tokens": 904268.0,
      "step": 64
    },
    {
      "epoch": 0.416,
      "grad_norm": 3.407878875732422,
      "learning_rate": 5e-06,
      "loss": 1.3261,
      "mean_token_accuracy": 0.6527406051754951,
      "num_tokens": 919814.0,
      "step": 65
    },
    {
      "epoch": 0.4224,
      "grad_norm": 3.381817102432251,
      "learning_rate": 5e-06,
      "loss": 1.2016,
      "mean_token_accuracy": 0.674388200044632,
      "num_tokens": 933076.0,
      "step": 66
    },
    {
      "epoch": 0.4288,
      "grad_norm": 3.4750583171844482,
      "learning_rate": 5e-06,
      "loss": 1.2737,
      "mean_token_accuracy": 0.6750325560569763,
      "num_tokens": 947949.0,
      "step": 67
    },
    {
      "epoch": 0.4352,
      "grad_norm": 3.401789426803589,
      "learning_rate": 5e-06,
      "loss": 1.443,
      "mean_token_accuracy": 0.6314026713371277,
      "num_tokens": 963300.0,
      "step": 68
    },
    {
      "epoch": 0.4416,
      "grad_norm": 3.7318997383117676,
      "learning_rate": 5e-06,
      "loss": 1.3147,
      "mean_token_accuracy": 0.6670505329966545,
      "num_tokens": 978591.0,
      "step": 69
    },
    {
      "epoch": 0.448,
      "grad_norm": 3.5171058177948,
      "learning_rate": 5e-06,
      "loss": 1.2636,
      "mean_token_accuracy": 0.6640981435775757,
      "num_tokens": 995710.0,
      "step": 70
    },
    {
      "epoch": 0.4544,
      "grad_norm": 3.5986971855163574,
      "learning_rate": 5e-06,
      "loss": 1.4008,
      "mean_token_accuracy": 0.6460948511958122,
      "num_tokens": 1008761.0,
      "step": 71
    },
    {
      "epoch": 0.4608,
      "grad_norm": 3.790513753890991,
      "learning_rate": 5e-06,
      "loss": 1.0588,
      "mean_token_accuracy": 0.7220645099878311,
      "num_tokens": 1022118.0,
      "step": 72
    },
    {
      "epoch": 0.4672,
      "grad_norm": 3.2633607387542725,
      "learning_rate": 5e-06,
      "loss": 1.4292,
      "mean_token_accuracy": 0.633380301296711,
      "num_tokens": 1036809.0,
      "step": 73
    },
    {
      "epoch": 0.4736,
      "grad_norm": 4.51404333114624,
      "learning_rate": 5e-06,
      "loss": 1.2447,
      "mean_token_accuracy": 0.6805145666003227,
      "num_tokens": 1051135.0,
      "step": 74
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.9093401432037354,
      "learning_rate": 5e-06,
      "loss": 1.2158,
      "mean_token_accuracy": 0.6765370890498161,
      "num_tokens": 1066566.0,
      "step": 75
    },
    {
      "epoch": 0.4864,
      "grad_norm": 3.798604726791382,
      "learning_rate": 5e-06,
      "loss": 1.2877,
      "mean_token_accuracy": 0.6586913466453552,
      "num_tokens": 1080474.0,
      "step": 76
    },
    {
      "epoch": 0.4928,
      "grad_norm": 3.8617875576019287,
      "learning_rate": 5e-06,
      "loss": 1.3414,
      "mean_token_accuracy": 0.6375375539064407,
      "num_tokens": 1092414.0,
      "step": 77
    },
    {
      "epoch": 0.4992,
      "grad_norm": 3.4888036251068115,
      "learning_rate": 5e-06,
      "loss": 1.3153,
      "mean_token_accuracy": 0.6624401807785034,
      "num_tokens": 1105750.0,
      "step": 78
    },
    {
      "epoch": 0.5056,
      "grad_norm": 4.31488037109375,
      "learning_rate": 5e-06,
      "loss": 1.5732,
      "mean_token_accuracy": 0.631282813847065,
      "num_tokens": 1119046.0,
      "step": 79
    },
    {
      "epoch": 0.512,
      "grad_norm": 3.053265333175659,
      "learning_rate": 5e-06,
      "loss": 1.1599,
      "mean_token_accuracy": 0.6818074658513069,
      "num_tokens": 1135088.0,
      "step": 80
    },
    {
      "epoch": 0.5184,
      "grad_norm": 3.616725444793701,
      "learning_rate": 5e-06,
      "loss": 1.43,
      "mean_token_accuracy": 0.6406295001506805,
      "num_tokens": 1151156.0,
      "step": 81
    },
    {
      "epoch": 0.5248,
      "grad_norm": 4.136235237121582,
      "learning_rate": 5e-06,
      "loss": 1.0219,
      "mean_token_accuracy": 0.730844259262085,
      "num_tokens": 1167434.0,
      "step": 82
    },
    {
      "epoch": 0.5312,
      "grad_norm": 3.338418483734131,
      "learning_rate": 5e-06,
      "loss": 1.2563,
      "mean_token_accuracy": 0.6696086227893829,
      "num_tokens": 1180703.0,
      "step": 83
    },
    {
      "epoch": 0.5376,
      "grad_norm": 4.320113658905029,
      "learning_rate": 5e-06,
      "loss": 1.3359,
      "mean_token_accuracy": 0.6449860632419586,
      "num_tokens": 1192466.0,
      "step": 84
    },
    {
      "epoch": 0.544,
      "grad_norm": 3.6109604835510254,
      "learning_rate": 5e-06,
      "loss": 1.3403,
      "mean_token_accuracy": 0.6509855315089226,
      "num_tokens": 1207465.0,
      "step": 85
    },
    {
      "epoch": 0.5504,
      "grad_norm": 3.9164552688598633,
      "learning_rate": 5e-06,
      "loss": 1.2705,
      "mean_token_accuracy": 0.657896488904953,
      "num_tokens": 1218611.0,
      "step": 86
    },
    {
      "epoch": 0.5568,
      "grad_norm": 3.731842279434204,
      "learning_rate": 5e-06,
      "loss": 1.2964,
      "mean_token_accuracy": 0.6592853665351868,
      "num_tokens": 1231942.0,
      "step": 87
    },
    {
      "epoch": 0.5632,
      "grad_norm": 3.651313304901123,
      "learning_rate": 5e-06,
      "loss": 1.36,
      "mean_token_accuracy": 0.6576117277145386,
      "num_tokens": 1246108.0,
      "step": 88
    },
    {
      "epoch": 0.5696,
      "grad_norm": 3.089268445968628,
      "learning_rate": 5e-06,
      "loss": 1.2009,
      "mean_token_accuracy": 0.6747561246156693,
      "num_tokens": 1261356.0,
      "step": 89
    },
    {
      "epoch": 0.576,
      "grad_norm": 3.238022565841675,
      "learning_rate": 5e-06,
      "loss": 1.3602,
      "mean_token_accuracy": 0.6399379670619965,
      "num_tokens": 1277200.0,
      "step": 90
    },
    {
      "epoch": 0.5824,
      "grad_norm": 3.467700481414795,
      "learning_rate": 5e-06,
      "loss": 1.3403,
      "mean_token_accuracy": 0.6624496802687645,
      "num_tokens": 1292186.0,
      "step": 91
    },
    {
      "epoch": 0.5888,
      "grad_norm": 3.325700283050537,
      "learning_rate": 5e-06,
      "loss": 1.2627,
      "mean_token_accuracy": 0.6559372171759605,
      "num_tokens": 1304719.0,
      "step": 92
    },
    {
      "epoch": 0.5952,
      "grad_norm": 3.621305227279663,
      "learning_rate": 5e-06,
      "loss": 1.1766,
      "mean_token_accuracy": 0.6822386905550957,
      "num_tokens": 1318790.0,
      "step": 93
    },
    {
      "epoch": 0.6016,
      "grad_norm": 3.729935884475708,
      "learning_rate": 5e-06,
      "loss": 1.3674,
      "mean_token_accuracy": 0.645961195230484,
      "num_tokens": 1329771.0,
      "step": 94
    },
    {
      "epoch": 0.608,
      "grad_norm": 3.6264734268188477,
      "learning_rate": 5e-06,
      "loss": 1.1152,
      "mean_token_accuracy": 0.687596932053566,
      "num_tokens": 1342886.0,
      "step": 95
    },
    {
      "epoch": 0.6144,
      "grad_norm": 3.485283136367798,
      "learning_rate": 5e-06,
      "loss": 1.4425,
      "mean_token_accuracy": 0.6406073719263077,
      "num_tokens": 1358395.0,
      "step": 96
    },
    {
      "epoch": 0.6208,
      "grad_norm": 3.9530715942382812,
      "learning_rate": 5e-06,
      "loss": 1.2683,
      "mean_token_accuracy": 0.6643862873315811,
      "num_tokens": 1371799.0,
      "step": 97
    },
    {
      "epoch": 0.6272,
      "grad_norm": 4.173793315887451,
      "learning_rate": 5e-06,
      "loss": 1.2914,
      "mean_token_accuracy": 0.660434328019619,
      "num_tokens": 1383470.0,
      "step": 98
    },
    {
      "epoch": 0.6336,
      "grad_norm": 3.614288330078125,
      "learning_rate": 5e-06,
      "loss": 1.2038,
      "mean_token_accuracy": 0.6892227753996849,
      "num_tokens": 1400378.0,
      "step": 99
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.9820358753204346,
      "learning_rate": 5e-06,
      "loss": 1.1759,
      "mean_token_accuracy": 0.6901198551058769,
      "num_tokens": 1417726.0,
      "step": 100
    },
    {
      "epoch": 0.6464,
      "grad_norm": 3.3349997997283936,
      "learning_rate": 5e-06,
      "loss": 1.3748,
      "mean_token_accuracy": 0.6374614164233208,
      "num_tokens": 1433643.0,
      "step": 101
    },
    {
      "epoch": 0.6528,
      "grad_norm": 3.2100603580474854,
      "learning_rate": 5e-06,
      "loss": 1.3996,
      "mean_token_accuracy": 0.6497422009706497,
      "num_tokens": 1449613.0,
      "step": 102
    },
    {
      "epoch": 0.6592,
      "grad_norm": 3.778851270675659,
      "learning_rate": 5e-06,
      "loss": 1.0333,
      "mean_token_accuracy": 0.7377151399850845,
      "num_tokens": 1462451.0,
      "step": 103
    },
    {
      "epoch": 0.6656,
      "grad_norm": 3.1992948055267334,
      "learning_rate": 5e-06,
      "loss": 1.3104,
      "mean_token_accuracy": 0.6540233194828033,
      "num_tokens": 1476702.0,
      "step": 104
    },
    {
      "epoch": 0.672,
      "grad_norm": 3.510753631591797,
      "learning_rate": 5e-06,
      "loss": 1.4638,
      "mean_token_accuracy": 0.6315533146262169,
      "num_tokens": 1491797.0,
      "step": 105
    },
    {
      "epoch": 0.6784,
      "grad_norm": 3.976029396057129,
      "learning_rate": 5e-06,
      "loss": 1.5173,
      "mean_token_accuracy": 0.6174216568470001,
      "num_tokens": 1506937.0,
      "step": 106
    },
    {
      "epoch": 0.6848,
      "grad_norm": 4.135453224182129,
      "learning_rate": 5e-06,
      "loss": 1.2671,
      "mean_token_accuracy": 0.6688078567385674,
      "num_tokens": 1517912.0,
      "step": 107
    },
    {
      "epoch": 0.6912,
      "grad_norm": 3.417698621749878,
      "learning_rate": 5e-06,
      "loss": 1.3911,
      "mean_token_accuracy": 0.653182253241539,
      "num_tokens": 1534828.0,
      "step": 108
    },
    {
      "epoch": 0.6976,
      "grad_norm": 3.1860220432281494,
      "learning_rate": 5e-06,
      "loss": 1.1702,
      "mean_token_accuracy": 0.6850942298769951,
      "num_tokens": 1549388.0,
      "step": 109
    },
    {
      "epoch": 0.704,
      "grad_norm": 4.109538555145264,
      "learning_rate": 5e-06,
      "loss": 0.9738,
      "mean_token_accuracy": 0.7242774963378906,
      "num_tokens": 1561336.0,
      "step": 110
    },
    {
      "epoch": 0.7104,
      "grad_norm": 3.6836867332458496,
      "learning_rate": 5e-06,
      "loss": 1.1749,
      "mean_token_accuracy": 0.6976689174771309,
      "num_tokens": 1576039.0,
      "step": 111
    },
    {
      "epoch": 0.7168,
      "grad_norm": 3.6196529865264893,
      "learning_rate": 5e-06,
      "loss": 1.2509,
      "mean_token_accuracy": 0.6862899139523506,
      "num_tokens": 1590356.0,
      "step": 112
    },
    {
      "epoch": 0.7232,
      "grad_norm": 3.6636641025543213,
      "learning_rate": 5e-06,
      "loss": 1.222,
      "mean_token_accuracy": 0.6742222532629967,
      "num_tokens": 1603828.0,
      "step": 113
    },
    {
      "epoch": 0.7296,
      "grad_norm": 3.1560022830963135,
      "learning_rate": 5e-06,
      "loss": 1.238,
      "mean_token_accuracy": 0.6736825779080391,
      "num_tokens": 1621001.0,
      "step": 114
    },
    {
      "epoch": 0.736,
      "grad_norm": 3.1845784187316895,
      "learning_rate": 5e-06,
      "loss": 1.3708,
      "mean_token_accuracy": 0.6455038785934448,
      "num_tokens": 1636499.0,
      "step": 115
    },
    {
      "epoch": 0.7424,
      "grad_norm": 3.5999317169189453,
      "learning_rate": 5e-06,
      "loss": 1.1897,
      "mean_token_accuracy": 0.6913699880242348,
      "num_tokens": 1649043.0,
      "step": 116
    },
    {
      "epoch": 0.7488,
      "grad_norm": 3.9774956703186035,
      "learning_rate": 5e-06,
      "loss": 1.0401,
      "mean_token_accuracy": 0.7169342339038849,
      "num_tokens": 1663873.0,
      "step": 117
    },
    {
      "epoch": 0.7552,
      "grad_norm": 3.3667006492614746,
      "learning_rate": 5e-06,
      "loss": 1.389,
      "mean_token_accuracy": 0.6400957256555557,
      "num_tokens": 1679667.0,
      "step": 118
    },
    {
      "epoch": 0.7616,
      "grad_norm": 3.5805838108062744,
      "learning_rate": 5e-06,
      "loss": 1.3896,
      "mean_token_accuracy": 0.6595878526568413,
      "num_tokens": 1695115.0,
      "step": 119
    },
    {
      "epoch": 0.768,
      "grad_norm": 3.8038456439971924,
      "learning_rate": 5e-06,
      "loss": 1.4199,
      "mean_token_accuracy": 0.6296796873211861,
      "num_tokens": 1707134.0,
      "step": 120
    },
    {
      "epoch": 0.7744,
      "grad_norm": 3.626406669616699,
      "learning_rate": 5e-06,
      "loss": 1.2602,
      "mean_token_accuracy": 0.6764451712369919,
      "num_tokens": 1720062.0,
      "step": 121
    },
    {
      "epoch": 0.7808,
      "grad_norm": 4.185618877410889,
      "learning_rate": 5e-06,
      "loss": 1.4208,
      "mean_token_accuracy": 0.6561515182256699,
      "num_tokens": 1731658.0,
      "step": 122
    },
    {
      "epoch": 0.7872,
      "grad_norm": 3.6125693321228027,
      "learning_rate": 5e-06,
      "loss": 1.1589,
      "mean_token_accuracy": 0.6781378239393234,
      "num_tokens": 1747033.0,
      "step": 123
    },
    {
      "epoch": 0.7936,
      "grad_norm": 4.1037373542785645,
      "learning_rate": 5e-06,
      "loss": 1.2256,
      "mean_token_accuracy": 0.6652502194046974,
      "num_tokens": 1758454.0,
      "step": 124
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.3770744800567627,
      "learning_rate": 5e-06,
      "loss": 1.2673,
      "mean_token_accuracy": 0.6793160140514374,
      "num_tokens": 1773549.0,
      "step": 125
    },
    {
      "epoch": 0.8064,
      "grad_norm": 3.3607754707336426,
      "learning_rate": 5e-06,
      "loss": 1.2752,
      "mean_token_accuracy": 0.6853690519928932,
      "num_tokens": 1789080.0,
      "step": 126
    },
    {
      "epoch": 0.8128,
      "grad_norm": 3.806748390197754,
      "learning_rate": 5e-06,
      "loss": 1.1944,
      "mean_token_accuracy": 0.6872068718075752,
      "num_tokens": 1801796.0,
      "step": 127
    },
    {
      "epoch": 0.8192,
      "grad_norm": 3.8479700088500977,
      "learning_rate": 5e-06,
      "loss": 1.3977,
      "mean_token_accuracy": 0.636077530682087,
      "num_tokens": 1814242.0,
      "step": 128
    },
    {
      "epoch": 0.8256,
      "grad_norm": 3.4976117610931396,
      "learning_rate": 5e-06,
      "loss": 1.1139,
      "mean_token_accuracy": 0.697039358317852,
      "num_tokens": 1826839.0,
      "step": 129
    },
    {
      "epoch": 0.832,
      "grad_norm": 3.545767307281494,
      "learning_rate": 5e-06,
      "loss": 1.4587,
      "mean_token_accuracy": 0.6343350037932396,
      "num_tokens": 1840704.0,
      "step": 130
    },
    {
      "epoch": 0.8384,
      "grad_norm": 3.3037948608398438,
      "learning_rate": 5e-06,
      "loss": 1.051,
      "mean_token_accuracy": 0.7136509492993355,
      "num_tokens": 1854753.0,
      "step": 131
    },
    {
      "epoch": 0.8448,
      "grad_norm": 3.749946117401123,
      "learning_rate": 5e-06,
      "loss": 1.5613,
      "mean_token_accuracy": 0.6120798960328102,
      "num_tokens": 1868075.0,
      "step": 132
    },
    {
      "epoch": 0.8512,
      "grad_norm": 3.2564704418182373,
      "learning_rate": 5e-06,
      "loss": 1.5152,
      "mean_token_accuracy": 0.6301259547472,
      "num_tokens": 1883671.0,
      "step": 133
    },
    {
      "epoch": 0.8576,
      "grad_norm": 3.6490490436553955,
      "learning_rate": 5e-06,
      "loss": 1.3408,
      "mean_token_accuracy": 0.657665990293026,
      "num_tokens": 1895662.0,
      "step": 134
    },
    {
      "epoch": 0.864,
      "grad_norm": 3.407827615737915,
      "learning_rate": 5e-06,
      "loss": 1.6605,
      "mean_token_accuracy": 0.5928493365645409,
      "num_tokens": 1910847.0,
      "step": 135
    },
    {
      "epoch": 0.8704,
      "grad_norm": 4.415482521057129,
      "learning_rate": 5e-06,
      "loss": 1.1946,
      "mean_token_accuracy": 0.6526624895632267,
      "num_tokens": 1925173.0,
      "step": 136
    },
    {
      "epoch": 0.8768,
      "grad_norm": 4.364404201507568,
      "learning_rate": 5e-06,
      "loss": 1.3597,
      "mean_token_accuracy": 0.6597945764660835,
      "num_tokens": 1938507.0,
      "step": 137
    },
    {
      "epoch": 0.8832,
      "grad_norm": 3.498443603515625,
      "learning_rate": 5e-06,
      "loss": 1.2335,
      "mean_token_accuracy": 0.6793266087770462,
      "num_tokens": 1955459.0,
      "step": 138
    },
    {
      "epoch": 0.8896,
      "grad_norm": 3.321516275405884,
      "learning_rate": 5e-06,
      "loss": 1.1991,
      "mean_token_accuracy": 0.6588682383298874,
      "num_tokens": 1970350.0,
      "step": 139
    },
    {
      "epoch": 0.896,
      "grad_norm": 3.556004524230957,
      "learning_rate": 5e-06,
      "loss": 1.1146,
      "mean_token_accuracy": 0.7137203514575958,
      "num_tokens": 1984042.0,
      "step": 140
    },
    {
      "epoch": 0.9024,
      "grad_norm": 3.075051784515381,
      "learning_rate": 5e-06,
      "loss": 1.2796,
      "mean_token_accuracy": 0.6648645251989365,
      "num_tokens": 2000795.0,
      "step": 141
    },
    {
      "epoch": 0.9088,
      "grad_norm": 3.5248730182647705,
      "learning_rate": 5e-06,
      "loss": 1.2528,
      "mean_token_accuracy": 0.6755008921027184,
      "num_tokens": 2015306.0,
      "step": 142
    },
    {
      "epoch": 0.9152,
      "grad_norm": 3.545891284942627,
      "learning_rate": 5e-06,
      "loss": 1.5914,
      "mean_token_accuracy": 0.6275190636515617,
      "num_tokens": 2030477.0,
      "step": 143
    },
    {
      "epoch": 0.9216,
      "grad_norm": 3.4129698276519775,
      "learning_rate": 5e-06,
      "loss": 1.1794,
      "mean_token_accuracy": 0.6718302294611931,
      "num_tokens": 2045094.0,
      "step": 144
    },
    {
      "epoch": 0.928,
      "grad_norm": 4.104367733001709,
      "learning_rate": 5e-06,
      "loss": 1.2672,
      "mean_token_accuracy": 0.680908165872097,
      "num_tokens": 2059572.0,
      "step": 145
    },
    {
      "epoch": 0.9344,
      "grad_norm": 3.316425323486328,
      "learning_rate": 5e-06,
      "loss": 1.4394,
      "mean_token_accuracy": 0.6278339251875877,
      "num_tokens": 2074507.0,
      "step": 146
    },
    {
      "epoch": 0.9408,
      "grad_norm": 3.6546125411987305,
      "learning_rate": 5e-06,
      "loss": 1.1124,
      "mean_token_accuracy": 0.6924281641840935,
      "num_tokens": 2087040.0,
      "step": 147
    },
    {
      "epoch": 0.9472,
      "grad_norm": 3.6782755851745605,
      "learning_rate": 5e-06,
      "loss": 1.3804,
      "mean_token_accuracy": 0.6391346678137779,
      "num_tokens": 2101730.0,
      "step": 148
    },
    {
      "epoch": 0.9536,
      "grad_norm": 3.386892080307007,
      "learning_rate": 5e-06,
      "loss": 1.2437,
      "mean_token_accuracy": 0.6677525192499161,
      "num_tokens": 2116581.0,
      "step": 149
    },
    {
      "epoch": 0.96,
      "grad_norm": 3.5542984008789062,
      "learning_rate": 5e-06,
      "loss": 1.0982,
      "mean_token_accuracy": 0.7068964689970016,
      "num_tokens": 2131452.0,
      "step": 150
    },
    {
      "epoch": 0.9664,
      "grad_norm": 3.5495564937591553,
      "learning_rate": 5e-06,
      "loss": 1.4026,
      "mean_token_accuracy": 0.6545470431447029,
      "num_tokens": 2147428.0,
      "step": 151
    },
    {
      "epoch": 0.9728,
      "grad_norm": 3.4057626724243164,
      "learning_rate": 5e-06,
      "loss": 1.3493,
      "mean_token_accuracy": 0.6586249023675919,
      "num_tokens": 2164711.0,
      "step": 152
    },
    {
      "epoch": 0.9792,
      "grad_norm": 3.815967559814453,
      "learning_rate": 5e-06,
      "loss": 1.3319,
      "mean_token_accuracy": 0.6602702289819717,
      "num_tokens": 2177075.0,
      "step": 153
    },
    {
      "epoch": 0.9856,
      "grad_norm": 3.8833365440368652,
      "learning_rate": 5e-06,
      "loss": 1.3236,
      "mean_token_accuracy": 0.6528388634324074,
      "num_tokens": 2190015.0,
      "step": 154
    },
    {
      "epoch": 0.992,
      "grad_norm": 3.010931968688965,
      "learning_rate": 5e-06,
      "loss": 1.3359,
      "mean_token_accuracy": 0.6606268882751465,
      "num_tokens": 2208904.0,
      "step": 155
    },
    {
      "epoch": 0.9984,
      "grad_norm": 3.366220474243164,
      "learning_rate": 5e-06,
      "loss": 1.2807,
      "mean_token_accuracy": 0.6731663048267365,
      "num_tokens": 2223354.0,
      "step": 156
    }
  ],
  "logging_steps": 1,
  "max_steps": 156,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6046380687360.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}