|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.10020040080160321,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000501002004008016,
      "grad_norm": 2.6050267219543457,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 2.0966,
      "step": 1
    },
    {
      "epoch": 0.001002004008016032,
      "grad_norm": 2.553297996520996,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 2.5944,
      "step": 2
    },
    {
      "epoch": 0.001503006012024048,
      "grad_norm": 2.3349509239196777,
      "learning_rate": 8.999999999999999e-05,
      "loss": 2.5769,
      "step": 3
    },
    {
      "epoch": 0.002004008016032064,
      "grad_norm": 2.3492772579193115,
      "learning_rate": 0.00011999999999999999,
      "loss": 2.3153,
      "step": 4
    },
    {
      "epoch": 0.00250501002004008,
      "grad_norm": 1.5965133905410767,
      "learning_rate": 0.00015,
      "loss": 2.3999,
      "step": 5
    },
    {
      "epoch": 0.003006012024048096,
      "grad_norm": 1.4280346632003784,
      "learning_rate": 0.00017999999999999998,
      "loss": 2.0657,
      "step": 6
    },
    {
      "epoch": 0.0035070140280561123,
      "grad_norm": 1.726511836051941,
      "learning_rate": 0.00020999999999999998,
      "loss": 2.2877,
      "step": 7
    },
    {
      "epoch": 0.004008016032064128,
      "grad_norm": 1.741253137588501,
      "learning_rate": 0.00023999999999999998,
      "loss": 2.2613,
      "step": 8
    },
    {
      "epoch": 0.0045090180360721445,
      "grad_norm": 1.8103491067886353,
      "learning_rate": 0.00027,
      "loss": 2.1171,
      "step": 9
    },
    {
      "epoch": 0.00501002004008016,
      "grad_norm": 1.4325164556503296,
      "learning_rate": 0.0003,
      "loss": 2.0913,
      "step": 10
    },
    {
      "epoch": 0.005511022044088177,
      "grad_norm": 1.3550007343292236,
      "learning_rate": 0.0002999246609743847,
      "loss": 2.1027,
      "step": 11
    },
    {
      "epoch": 0.006012024048096192,
      "grad_norm": 1.288711428642273,
      "learning_rate": 0.00029984932194876944,
      "loss": 2.0211,
      "step": 12
    },
    {
      "epoch": 0.006513026052104208,
      "grad_norm": 1.4124622344970703,
      "learning_rate": 0.00029977398292315414,
      "loss": 2.1794,
      "step": 13
    },
    {
      "epoch": 0.0070140280561122245,
      "grad_norm": 1.2239776849746704,
      "learning_rate": 0.0002996986438975389,
      "loss": 1.8072,
      "step": 14
    },
    {
      "epoch": 0.00751503006012024,
      "grad_norm": 1.2423063516616821,
      "learning_rate": 0.00029962330487192366,
      "loss": 2.1701,
      "step": 15
    },
    {
      "epoch": 0.008016032064128256,
      "grad_norm": 1.249097228050232,
      "learning_rate": 0.00029954796584630836,
      "loss": 2.155,
      "step": 16
    },
    {
      "epoch": 0.008517034068136272,
      "grad_norm": 1.2091352939605713,
      "learning_rate": 0.00029947262682069307,
      "loss": 2.1671,
      "step": 17
    },
    {
      "epoch": 0.009018036072144289,
      "grad_norm": 1.168344497680664,
      "learning_rate": 0.00029939728779507783,
      "loss": 1.8644,
      "step": 18
    },
    {
      "epoch": 0.009519038076152305,
      "grad_norm": 1.1328169107437134,
      "learning_rate": 0.00029932194876946253,
      "loss": 2.0983,
      "step": 19
    },
    {
      "epoch": 0.01002004008016032,
      "grad_norm": 1.081700086593628,
      "learning_rate": 0.0002992466097438473,
      "loss": 1.9168,
      "step": 20
    },
    {
      "epoch": 0.010521042084168337,
      "grad_norm": 1.0554951429367065,
      "learning_rate": 0.000299171270718232,
      "loss": 1.9249,
      "step": 21
    },
    {
      "epoch": 0.011022044088176353,
      "grad_norm": 1.1648558378219604,
      "learning_rate": 0.00029909593169261676,
      "loss": 1.9194,
      "step": 22
    },
    {
      "epoch": 0.011523046092184368,
      "grad_norm": 0.9951939582824707,
      "learning_rate": 0.0002990205926670015,
      "loss": 1.9181,
      "step": 23
    },
    {
      "epoch": 0.012024048096192385,
      "grad_norm": 1.038643717765808,
      "learning_rate": 0.0002989452536413862,
      "loss": 2.0071,
      "step": 24
    },
    {
      "epoch": 0.012525050100200401,
      "grad_norm": 0.9779028296470642,
      "learning_rate": 0.0002988699146157709,
      "loss": 2.1351,
      "step": 25
    },
    {
      "epoch": 0.013026052104208416,
      "grad_norm": 0.9438173174858093,
      "learning_rate": 0.0002987945755901557,
      "loss": 2.0301,
      "step": 26
    },
    {
      "epoch": 0.013527054108216433,
      "grad_norm": 0.9372079372406006,
      "learning_rate": 0.0002987192365645404,
      "loss": 1.8637,
      "step": 27
    },
    {
      "epoch": 0.014028056112224449,
      "grad_norm": 0.9139612317085266,
      "learning_rate": 0.00029864389753892515,
      "loss": 1.9235,
      "step": 28
    },
    {
      "epoch": 0.014529058116232466,
      "grad_norm": 0.9861688613891602,
      "learning_rate": 0.00029856855851330985,
      "loss": 1.8747,
      "step": 29
    },
    {
      "epoch": 0.01503006012024048,
      "grad_norm": 0.9747604131698608,
      "learning_rate": 0.0002984932194876946,
      "loss": 1.9839,
      "step": 30
    },
    {
      "epoch": 0.015531062124248497,
      "grad_norm": 0.9112619161605835,
      "learning_rate": 0.0002984178804620793,
      "loss": 2.1169,
      "step": 31
    },
    {
      "epoch": 0.01603206412825651,
      "grad_norm": 0.8841367363929749,
      "learning_rate": 0.0002983425414364641,
      "loss": 1.8372,
      "step": 32
    },
    {
      "epoch": 0.016533066132264528,
      "grad_norm": 0.8845950961112976,
      "learning_rate": 0.0002982672024108488,
      "loss": 1.7765,
      "step": 33
    },
    {
      "epoch": 0.017034068136272545,
      "grad_norm": 0.8608232736587524,
      "learning_rate": 0.00029819186338523354,
      "loss": 1.7745,
      "step": 34
    },
    {
      "epoch": 0.01753507014028056,
      "grad_norm": 0.8822921514511108,
      "learning_rate": 0.00029811652435961824,
      "loss": 1.9187,
      "step": 35
    },
    {
      "epoch": 0.018036072144288578,
      "grad_norm": 0.8354641199111938,
      "learning_rate": 0.00029804118533400295,
      "loss": 1.9194,
      "step": 36
    },
    {
      "epoch": 0.018537074148296594,
      "grad_norm": 0.845734179019928,
      "learning_rate": 0.00029796584630838776,
      "loss": 1.95,
      "step": 37
    },
    {
      "epoch": 0.01903807615230461,
      "grad_norm": 0.7486892342567444,
      "learning_rate": 0.00029789050728277247,
      "loss": 1.6453,
      "step": 38
    },
    {
      "epoch": 0.019539078156312624,
      "grad_norm": 0.8062511682510376,
      "learning_rate": 0.00029781516825715717,
      "loss": 1.8782,
      "step": 39
    },
    {
      "epoch": 0.02004008016032064,
      "grad_norm": 0.7674809694290161,
      "learning_rate": 0.00029773982923154193,
      "loss": 1.8426,
      "step": 40
    },
    {
      "epoch": 0.020541082164328657,
      "grad_norm": 0.7894824743270874,
      "learning_rate": 0.00029766449020592663,
      "loss": 1.8384,
      "step": 41
    },
    {
      "epoch": 0.021042084168336674,
      "grad_norm": 0.7632150053977966,
      "learning_rate": 0.0002975891511803114,
      "loss": 1.7607,
      "step": 42
    },
    {
      "epoch": 0.02154308617234469,
      "grad_norm": 0.8097577095031738,
      "learning_rate": 0.0002975138121546961,
      "loss": 1.9556,
      "step": 43
    },
    {
      "epoch": 0.022044088176352707,
      "grad_norm": 0.6966553926467896,
      "learning_rate": 0.0002974384731290808,
      "loss": 1.7072,
      "step": 44
    },
    {
      "epoch": 0.022545090180360723,
      "grad_norm": 0.886178731918335,
      "learning_rate": 0.00029736313410346556,
      "loss": 2.3305,
      "step": 45
    },
    {
      "epoch": 0.023046092184368736,
      "grad_norm": 0.7746443748474121,
      "learning_rate": 0.0002972877950778503,
      "loss": 2.1129,
      "step": 46
    },
    {
      "epoch": 0.023547094188376753,
      "grad_norm": 0.8091543912887573,
      "learning_rate": 0.000297212456052235,
      "loss": 2.0141,
      "step": 47
    },
    {
      "epoch": 0.02404809619238477,
      "grad_norm": 0.7234994769096375,
      "learning_rate": 0.0002971371170266198,
      "loss": 1.9564,
      "step": 48
    },
    {
      "epoch": 0.024549098196392786,
      "grad_norm": 0.7358165979385376,
      "learning_rate": 0.0002970617780010045,
      "loss": 1.9121,
      "step": 49
    },
    {
      "epoch": 0.025050100200400802,
      "grad_norm": 0.7158864736557007,
      "learning_rate": 0.0002969864389753892,
      "loss": 1.7413,
      "step": 50
    },
    {
      "epoch": 0.02555110220440882,
      "grad_norm": 0.7489972710609436,
      "learning_rate": 0.00029691109994977395,
      "loss": 1.9817,
      "step": 51
    },
    {
      "epoch": 0.026052104208416832,
      "grad_norm": 0.7536230087280273,
      "learning_rate": 0.00029683576092415866,
      "loss": 1.9258,
      "step": 52
    },
    {
      "epoch": 0.02655310621242485,
      "grad_norm": 0.8415588140487671,
      "learning_rate": 0.0002967604218985434,
      "loss": 1.9913,
      "step": 53
    },
    {
      "epoch": 0.027054108216432865,
      "grad_norm": 0.768453061580658,
      "learning_rate": 0.0002966850828729282,
      "loss": 2.0848,
      "step": 54
    },
    {
      "epoch": 0.02755511022044088,
      "grad_norm": 0.7687628269195557,
      "learning_rate": 0.0002966097438473129,
      "loss": 1.9663,
      "step": 55
    },
    {
      "epoch": 0.028056112224448898,
      "grad_norm": 0.7271875739097595,
      "learning_rate": 0.00029653440482169764,
      "loss": 1.9076,
      "step": 56
    },
    {
      "epoch": 0.028557114228456915,
      "grad_norm": 0.6827739477157593,
      "learning_rate": 0.00029645906579608234,
      "loss": 1.7408,
      "step": 57
    },
    {
      "epoch": 0.02905811623246493,
      "grad_norm": 0.805442214012146,
      "learning_rate": 0.00029638372677046705,
      "loss": 1.9494,
      "step": 58
    },
    {
      "epoch": 0.029559118236472944,
      "grad_norm": 0.7472760677337646,
      "learning_rate": 0.0002963083877448518,
      "loss": 1.9039,
      "step": 59
    },
    {
      "epoch": 0.03006012024048096,
      "grad_norm": 0.737997829914093,
      "learning_rate": 0.00029623304871923657,
      "loss": 1.7575,
      "step": 60
    },
    {
      "epoch": 0.030561122244488977,
      "grad_norm": 0.7741632461547852,
      "learning_rate": 0.00029615770969362127,
      "loss": 2.033,
      "step": 61
    },
    {
      "epoch": 0.031062124248496994,
      "grad_norm": 0.7429640293121338,
      "learning_rate": 0.00029608237066800603,
      "loss": 1.899,
      "step": 62
    },
    {
      "epoch": 0.03156312625250501,
      "grad_norm": 0.7532819509506226,
      "learning_rate": 0.00029600703164239073,
      "loss": 1.8829,
      "step": 63
    },
    {
      "epoch": 0.03206412825651302,
      "grad_norm": 0.7010864615440369,
      "learning_rate": 0.00029593169261677544,
      "loss": 1.821,
      "step": 64
    },
    {
      "epoch": 0.03256513026052104,
      "grad_norm": 0.737218976020813,
      "learning_rate": 0.0002958563535911602,
      "loss": 1.9476,
      "step": 65
    },
    {
      "epoch": 0.033066132264529056,
      "grad_norm": 0.7213771939277649,
      "learning_rate": 0.0002957810145655449,
      "loss": 1.9,
      "step": 66
    },
    {
      "epoch": 0.03356713426853707,
      "grad_norm": 0.7673630714416504,
      "learning_rate": 0.00029570567553992966,
      "loss": 2.0354,
      "step": 67
    },
    {
      "epoch": 0.03406813627254509,
      "grad_norm": 0.721883237361908,
      "learning_rate": 0.0002956303365143144,
      "loss": 1.7045,
      "step": 68
    },
    {
      "epoch": 0.034569138276553106,
      "grad_norm": 0.7481613159179688,
      "learning_rate": 0.0002955549974886991,
      "loss": 1.8593,
      "step": 69
    },
    {
      "epoch": 0.03507014028056112,
      "grad_norm": 0.7304365038871765,
      "learning_rate": 0.00029547965846308383,
      "loss": 1.752,
      "step": 70
    },
    {
      "epoch": 0.03557114228456914,
      "grad_norm": 0.7513949871063232,
      "learning_rate": 0.0002954043194374686,
      "loss": 1.9917,
      "step": 71
    },
    {
      "epoch": 0.036072144288577156,
      "grad_norm": 0.7858319878578186,
      "learning_rate": 0.0002953289804118533,
      "loss": 2.087,
      "step": 72
    },
    {
      "epoch": 0.03657314629258517,
      "grad_norm": 0.7061849236488342,
      "learning_rate": 0.00029525364138623805,
      "loss": 1.8584,
      "step": 73
    },
    {
      "epoch": 0.03707414829659319,
      "grad_norm": 0.7773633599281311,
      "learning_rate": 0.00029517830236062276,
      "loss": 1.9295,
      "step": 74
    },
    {
      "epoch": 0.037575150300601205,
      "grad_norm": 0.7323243618011475,
      "learning_rate": 0.0002951029633350075,
      "loss": 1.9308,
      "step": 75
    },
    {
      "epoch": 0.03807615230460922,
      "grad_norm": 0.725524365901947,
      "learning_rate": 0.0002950276243093923,
      "loss": 1.9616,
      "step": 76
    },
    {
      "epoch": 0.03857715430861723,
      "grad_norm": 0.7054190039634705,
      "learning_rate": 0.000294952285283777,
      "loss": 1.6339,
      "step": 77
    },
    {
      "epoch": 0.03907815631262525,
      "grad_norm": 0.6920881271362305,
      "learning_rate": 0.0002948769462581617,
      "loss": 1.809,
      "step": 78
    },
    {
      "epoch": 0.039579158316633264,
      "grad_norm": 0.6498042941093445,
      "learning_rate": 0.00029480160723254644,
      "loss": 1.6668,
      "step": 79
    },
    {
      "epoch": 0.04008016032064128,
      "grad_norm": 0.6788399815559387,
      "learning_rate": 0.00029472626820693115,
      "loss": 1.713,
      "step": 80
    },
    {
      "epoch": 0.0405811623246493,
      "grad_norm": 0.7612844705581665,
      "learning_rate": 0.0002946509291813159,
      "loss": 1.9079,
      "step": 81
    },
    {
      "epoch": 0.041082164328657314,
      "grad_norm": 0.6935157775878906,
      "learning_rate": 0.0002945755901557006,
      "loss": 1.7071,
      "step": 82
    },
    {
      "epoch": 0.04158316633266533,
      "grad_norm": 0.7637260556221008,
      "learning_rate": 0.00029450025113008537,
      "loss": 2.1187,
      "step": 83
    },
    {
      "epoch": 0.04208416833667335,
      "grad_norm": 0.7054077982902527,
      "learning_rate": 0.0002944249121044701,
      "loss": 1.7924,
      "step": 84
    },
    {
      "epoch": 0.042585170340681364,
      "grad_norm": 0.701574981212616,
      "learning_rate": 0.00029434957307885483,
      "loss": 1.8163,
      "step": 85
    },
    {
      "epoch": 0.04308617234468938,
      "grad_norm": 0.7836251854896545,
      "learning_rate": 0.00029427423405323954,
      "loss": 1.6773,
      "step": 86
    },
    {
      "epoch": 0.0435871743486974,
      "grad_norm": 0.7043123245239258,
      "learning_rate": 0.0002941988950276243,
      "loss": 1.8403,
      "step": 87
    },
    {
      "epoch": 0.04408817635270541,
      "grad_norm": 0.7133166790008545,
      "learning_rate": 0.000294123556002009,
      "loss": 1.8445,
      "step": 88
    },
    {
      "epoch": 0.04458917835671343,
      "grad_norm": 0.7085109949111938,
      "learning_rate": 0.00029404821697639376,
      "loss": 1.8926,
      "step": 89
    },
    {
      "epoch": 0.045090180360721446,
      "grad_norm": 0.701048731803894,
      "learning_rate": 0.00029397287795077847,
      "loss": 1.9562,
      "step": 90
    },
    {
      "epoch": 0.045591182364729456,
      "grad_norm": 0.6904398202896118,
      "learning_rate": 0.0002938975389251632,
      "loss": 1.8759,
      "step": 91
    },
    {
      "epoch": 0.04609218436873747,
      "grad_norm": 0.7591177821159363,
      "learning_rate": 0.00029382219989954793,
      "loss": 1.9114,
      "step": 92
    },
    {
      "epoch": 0.04659318637274549,
      "grad_norm": 0.7278134822845459,
      "learning_rate": 0.0002937468608739327,
      "loss": 1.9612,
      "step": 93
    },
    {
      "epoch": 0.047094188376753505,
      "grad_norm": 0.7665961384773254,
      "learning_rate": 0.0002936715218483174,
      "loss": 2.0373,
      "step": 94
    },
    {
      "epoch": 0.04759519038076152,
      "grad_norm": 0.7089666128158569,
      "learning_rate": 0.00029359618282270215,
      "loss": 2.062,
      "step": 95
    },
    {
      "epoch": 0.04809619238476954,
      "grad_norm": 0.6922783851623535,
      "learning_rate": 0.00029352084379708686,
      "loss": 1.8768,
      "step": 96
    },
    {
      "epoch": 0.048597194388777555,
      "grad_norm": 0.6904755234718323,
      "learning_rate": 0.00029344550477147156,
      "loss": 1.7936,
      "step": 97
    },
    {
      "epoch": 0.04909819639278557,
      "grad_norm": 0.7081161141395569,
      "learning_rate": 0.0002933701657458563,
      "loss": 1.7919,
      "step": 98
    },
    {
      "epoch": 0.04959919839679359,
      "grad_norm": 0.7969085574150085,
      "learning_rate": 0.0002932948267202411,
      "loss": 2.0337,
      "step": 99
    },
    {
      "epoch": 0.050100200400801605,
      "grad_norm": 0.7019357085227966,
      "learning_rate": 0.0002932194876946258,
      "loss": 1.9433,
      "step": 100
    },
    {
      "epoch": 0.05060120240480962,
      "grad_norm": 0.6784742474555969,
      "learning_rate": 0.00029314414866901054,
      "loss": 1.7878,
      "step": 101
    },
    {
      "epoch": 0.05110220440881764,
      "grad_norm": 0.7405863404273987,
      "learning_rate": 0.00029306880964339525,
      "loss": 1.7159,
      "step": 102
    },
    {
      "epoch": 0.051603206412825654,
      "grad_norm": 0.6663607358932495,
      "learning_rate": 0.00029299347061777995,
      "loss": 1.7941,
      "step": 103
    },
    {
      "epoch": 0.052104208416833664,
      "grad_norm": 0.7274753451347351,
      "learning_rate": 0.0002929181315921647,
      "loss": 1.8692,
      "step": 104
    },
    {
      "epoch": 0.05260521042084168,
      "grad_norm": 0.7117050886154175,
      "learning_rate": 0.0002928427925665494,
      "loss": 1.8508,
      "step": 105
    },
    {
      "epoch": 0.0531062124248497,
      "grad_norm": 0.6324647068977356,
      "learning_rate": 0.0002927674535409342,
      "loss": 1.6226,
      "step": 106
    },
    {
      "epoch": 0.053607214428857713,
      "grad_norm": 0.6551498770713806,
      "learning_rate": 0.00029269211451531894,
      "loss": 1.6189,
      "step": 107
    },
    {
      "epoch": 0.05410821643286573,
      "grad_norm": 0.7170161008834839,
      "learning_rate": 0.00029261677548970364,
      "loss": 1.9402,
      "step": 108
    },
    {
      "epoch": 0.05460921843687375,
      "grad_norm": 0.6295390129089355,
      "learning_rate": 0.0002925414364640884,
      "loss": 1.6223,
      "step": 109
    },
    {
      "epoch": 0.05511022044088176,
      "grad_norm": 0.6735630631446838,
      "learning_rate": 0.0002924660974384731,
      "loss": 1.9057,
      "step": 110
    },
    {
      "epoch": 0.05561122244488978,
      "grad_norm": 0.6805582046508789,
      "learning_rate": 0.0002923907584128578,
      "loss": 1.7012,
      "step": 111
    },
    {
      "epoch": 0.056112224448897796,
      "grad_norm": 0.6882889866828918,
      "learning_rate": 0.00029231541938724257,
      "loss": 1.7673,
      "step": 112
    },
    {
      "epoch": 0.05661322645290581,
      "grad_norm": 0.6445188522338867,
      "learning_rate": 0.00029224008036162727,
      "loss": 1.7567,
      "step": 113
    },
    {
      "epoch": 0.05711422845691383,
      "grad_norm": 0.6788764595985413,
      "learning_rate": 0.00029216474133601203,
      "loss": 1.8939,
      "step": 114
    },
    {
      "epoch": 0.057615230460921846,
      "grad_norm": 0.6786359548568726,
      "learning_rate": 0.0002920894023103968,
      "loss": 1.7107,
      "step": 115
    },
    {
      "epoch": 0.05811623246492986,
      "grad_norm": 0.73366779088974,
      "learning_rate": 0.0002920140632847815,
      "loss": 1.9247,
      "step": 116
    },
    {
      "epoch": 0.05861723446893788,
      "grad_norm": 0.6751691699028015,
      "learning_rate": 0.0002919387242591662,
      "loss": 1.8489,
      "step": 117
    },
    {
      "epoch": 0.05911823647294589,
      "grad_norm": 0.6604770421981812,
      "learning_rate": 0.00029186338523355096,
      "loss": 1.9441,
      "step": 118
    },
    {
      "epoch": 0.059619238476953905,
      "grad_norm": 0.6513468027114868,
      "learning_rate": 0.00029178804620793566,
      "loss": 1.9047,
      "step": 119
    },
    {
      "epoch": 0.06012024048096192,
      "grad_norm": 0.7937461137771606,
      "learning_rate": 0.0002917127071823204,
      "loss": 1.9885,
      "step": 120
    },
    {
      "epoch": 0.06062124248496994,
      "grad_norm": 0.6777886748313904,
      "learning_rate": 0.0002916373681567051,
      "loss": 2.0286,
      "step": 121
    },
    {
      "epoch": 0.061122244488977955,
      "grad_norm": 0.6839383244514465,
      "learning_rate": 0.0002915620291310899,
      "loss": 1.8146,
      "step": 122
    },
    {
      "epoch": 0.06162324649298597,
      "grad_norm": 0.6735116243362427,
      "learning_rate": 0.00029148669010547464,
      "loss": 1.7767,
      "step": 123
    },
    {
      "epoch": 0.06212424849699399,
      "grad_norm": 0.7088850736618042,
      "learning_rate": 0.00029141135107985935,
      "loss": 1.9,
      "step": 124
    },
    {
      "epoch": 0.062625250501002,
      "grad_norm": 0.6435878872871399,
      "learning_rate": 0.00029133601205424405,
      "loss": 1.7717,
      "step": 125
    },
    {
      "epoch": 0.06312625250501001,
      "grad_norm": 0.6343419551849365,
      "learning_rate": 0.0002912606730286288,
      "loss": 1.9353,
      "step": 126
    },
    {
      "epoch": 0.06362725450901803,
      "grad_norm": 0.6651162505149841,
      "learning_rate": 0.0002911853340030135,
      "loss": 1.8287,
      "step": 127
    },
    {
      "epoch": 0.06412825651302605,
      "grad_norm": 0.6920936703681946,
      "learning_rate": 0.0002911099949773983,
      "loss": 1.9564,
      "step": 128
    },
    {
      "epoch": 0.06462925851703406,
      "grad_norm": 0.6264630556106567,
      "learning_rate": 0.00029103465595178304,
      "loss": 1.7342,
      "step": 129
    },
    {
      "epoch": 0.06513026052104208,
      "grad_norm": 0.6150006055831909,
      "learning_rate": 0.00029095931692616774,
      "loss": 1.7269,
      "step": 130
    },
    {
      "epoch": 0.0656312625250501,
      "grad_norm": 0.6793592572212219,
      "learning_rate": 0.00029088397790055245,
      "loss": 1.9202,
      "step": 131
    },
    {
      "epoch": 0.06613226452905811,
      "grad_norm": 0.7007801532745361,
      "learning_rate": 0.0002908086388749372,
      "loss": 1.9389,
      "step": 132
    },
    {
      "epoch": 0.06663326653306613,
      "grad_norm": 0.5819805264472961,
      "learning_rate": 0.0002907332998493219,
      "loss": 1.4803,
      "step": 133
    },
    {
      "epoch": 0.06713426853707415,
      "grad_norm": 0.6895102262496948,
      "learning_rate": 0.00029065796082370667,
      "loss": 1.8499,
      "step": 134
    },
    {
      "epoch": 0.06763527054108216,
      "grad_norm": 0.6814382672309875,
      "learning_rate": 0.00029058262179809137,
      "loss": 1.9159,
      "step": 135
    },
    {
      "epoch": 0.06813627254509018,
      "grad_norm": 0.6768064498901367,
      "learning_rate": 0.0002905072827724761,
      "loss": 1.9397,
      "step": 136
    },
    {
      "epoch": 0.0686372745490982,
      "grad_norm": 0.6451818943023682,
      "learning_rate": 0.00029043194374686084,
      "loss": 1.7264,
      "step": 137
    },
    {
      "epoch": 0.06913827655310621,
      "grad_norm": 0.6633362770080566,
      "learning_rate": 0.0002903566047212456,
      "loss": 1.7829,
      "step": 138
    },
    {
      "epoch": 0.06963927855711423,
      "grad_norm": 0.659122109413147,
      "learning_rate": 0.0002902812656956303,
      "loss": 1.8371,
      "step": 139
    },
    {
      "epoch": 0.07014028056112225,
      "grad_norm": 0.657511293888092,
      "learning_rate": 0.00029020592667001506,
      "loss": 1.8257,
      "step": 140
    },
    {
      "epoch": 0.07064128256513026,
      "grad_norm": 0.6987048387527466,
      "learning_rate": 0.00029013058764439976,
      "loss": 1.9275,
      "step": 141
    },
    {
      "epoch": 0.07114228456913828,
      "grad_norm": 0.6087613105773926,
      "learning_rate": 0.0002900552486187845,
      "loss": 1.7095,
      "step": 142
    },
    {
      "epoch": 0.0716432865731463,
      "grad_norm": 0.6897568702697754,
      "learning_rate": 0.00028997990959316923,
      "loss": 1.8934,
      "step": 143
    },
    {
      "epoch": 0.07214428857715431,
      "grad_norm": 0.7103919386863708,
      "learning_rate": 0.00028990457056755393,
      "loss": 1.8729,
      "step": 144
    },
    {
      "epoch": 0.07264529058116233,
      "grad_norm": 0.5942186117172241,
      "learning_rate": 0.0002898292315419387,
      "loss": 1.6083,
      "step": 145
    },
    {
      "epoch": 0.07314629258517034,
      "grad_norm": 0.6732192039489746,
      "learning_rate": 0.00028975389251632345,
      "loss": 1.9068,
      "step": 146
    },
    {
      "epoch": 0.07364729458917836,
      "grad_norm": 0.6461887359619141,
      "learning_rate": 0.00028967855349070815,
      "loss": 1.6627,
      "step": 147
    },
    {
      "epoch": 0.07414829659318638,
      "grad_norm": 0.6353530883789062,
      "learning_rate": 0.0002896032144650929,
      "loss": 1.8885,
      "step": 148
    },
    {
      "epoch": 0.0746492985971944,
      "grad_norm": 0.5766530632972717,
      "learning_rate": 0.0002895278754394776,
      "loss": 1.4533,
      "step": 149
    },
    {
      "epoch": 0.07515030060120241,
      "grad_norm": 0.6750043630599976,
      "learning_rate": 0.0002894525364138623,
      "loss": 2.0198,
      "step": 150
    },
    {
      "epoch": 0.07565130260521043,
      "grad_norm": 0.589191734790802,
      "learning_rate": 0.0002893771973882471,
      "loss": 1.6217,
      "step": 151
    },
    {
      "epoch": 0.07615230460921844,
      "grad_norm": 0.7778326272964478,
      "learning_rate": 0.00028930185836263184,
      "loss": 1.8912,
      "step": 152
    },
    {
      "epoch": 0.07665330661322645,
      "grad_norm": 0.7026570439338684,
      "learning_rate": 0.00028922651933701655,
      "loss": 1.9116,
      "step": 153
    },
    {
      "epoch": 0.07715430861723446,
      "grad_norm": 0.6697170734405518,
      "learning_rate": 0.0002891511803114013,
      "loss": 1.8512,
      "step": 154
    },
    {
      "epoch": 0.07765531062124248,
      "grad_norm": 0.6900460124015808,
      "learning_rate": 0.000289075841285786,
      "loss": 1.9111,
      "step": 155
    },
    {
      "epoch": 0.0781563126252505,
      "grad_norm": 0.6959229111671448,
      "learning_rate": 0.0002890005022601707,
      "loss": 1.9443,
      "step": 156
    },
    {
      "epoch": 0.07865731462925851,
      "grad_norm": 0.676102340221405,
      "learning_rate": 0.0002889251632345555,
      "loss": 1.7728,
      "step": 157
    },
    {
      "epoch": 0.07915831663326653,
      "grad_norm": 0.6908376216888428,
      "learning_rate": 0.0002888498242089402,
      "loss": 1.7032,
      "step": 158
    },
    {
      "epoch": 0.07965931863727455,
      "grad_norm": 0.6415160298347473,
      "learning_rate": 0.00028877448518332494,
      "loss": 1.644,
      "step": 159
    },
    {
      "epoch": 0.08016032064128256,
      "grad_norm": 0.7101219296455383,
      "learning_rate": 0.0002886991461577097,
      "loss": 1.9767,
      "step": 160
    },
    {
      "epoch": 0.08066132264529058,
      "grad_norm": 0.6580759882926941,
      "learning_rate": 0.0002886238071320944,
      "loss": 1.7492,
      "step": 161
    },
    {
      "epoch": 0.0811623246492986,
      "grad_norm": 0.6474287509918213,
      "learning_rate": 0.00028854846810647916,
      "loss": 1.8357,
      "step": 162
    },
    {
      "epoch": 0.08166332665330661,
      "grad_norm": 0.622855007648468,
      "learning_rate": 0.00028847312908086386,
      "loss": 1.7535,
      "step": 163
    },
    {
      "epoch": 0.08216432865731463,
      "grad_norm": 0.6816668510437012,
      "learning_rate": 0.00028839779005524857,
      "loss": 2.0255,
      "step": 164
    },
    {
      "epoch": 0.08266533066132264,
      "grad_norm": 0.663633406162262,
      "learning_rate": 0.00028832245102963333,
      "loss": 1.7689,
      "step": 165
    },
    {
      "epoch": 0.08316633266533066,
      "grad_norm": 0.6390612721443176,
      "learning_rate": 0.00028824711200401803,
      "loss": 1.735,
      "step": 166
    },
    {
      "epoch": 0.08366733466933868,
      "grad_norm": 0.660830557346344,
      "learning_rate": 0.0002881717729784028,
      "loss": 1.7635,
      "step": 167
    },
    {
      "epoch": 0.0841683366733467,
      "grad_norm": 0.6632187366485596,
      "learning_rate": 0.00028809643395278755,
      "loss": 1.983,
      "step": 168
    },
    {
      "epoch": 0.08466933867735471,
      "grad_norm": 0.6418716311454773,
      "learning_rate": 0.00028802109492717226,
      "loss": 1.7174,
      "step": 169
    },
    {
      "epoch": 0.08517034068136273,
      "grad_norm": 0.6375272274017334,
      "learning_rate": 0.00028794575590155696,
      "loss": 1.7768,
      "step": 170
    },
    {
      "epoch": 0.08567134268537074,
      "grad_norm": 0.6906164884567261,
      "learning_rate": 0.0002878704168759417,
      "loss": 1.9821,
      "step": 171
    },
    {
      "epoch": 0.08617234468937876,
      "grad_norm": 0.6385218501091003,
      "learning_rate": 0.0002877950778503264,
      "loss": 1.7104,
      "step": 172
    },
    {
      "epoch": 0.08667334669338678,
      "grad_norm": 0.6564759016036987,
      "learning_rate": 0.0002877197388247112,
      "loss": 1.8649,
      "step": 173
    },
    {
      "epoch": 0.0871743486973948,
      "grad_norm": 0.6422074437141418,
      "learning_rate": 0.0002876443997990959,
      "loss": 1.899,
      "step": 174
    },
    {
      "epoch": 0.08767535070140281,
      "grad_norm": 0.6718552112579346,
      "learning_rate": 0.00028756906077348065,
      "loss": 2.0651,
      "step": 175
    },
    {
      "epoch": 0.08817635270541083,
      "grad_norm": 0.6550820469856262,
      "learning_rate": 0.0002874937217478654,
      "loss": 1.9969,
      "step": 176
    },
    {
      "epoch": 0.08867735470941884,
      "grad_norm": 0.6562731862068176,
      "learning_rate": 0.0002874183827222501,
      "loss": 1.8166,
      "step": 177
    },
    {
      "epoch": 0.08917835671342686,
      "grad_norm": 0.6029812097549438,
      "learning_rate": 0.0002873430436966348,
      "loss": 1.7566,
      "step": 178
    },
    {
      "epoch": 0.08967935871743488,
      "grad_norm": 0.6309391260147095,
      "learning_rate": 0.0002872677046710196,
      "loss": 1.8427,
      "step": 179
    },
    {
      "epoch": 0.09018036072144289,
      "grad_norm": 0.598958432674408,
      "learning_rate": 0.0002871923656454043,
      "loss": 1.6701,
      "step": 180
    },
    {
      "epoch": 0.0906813627254509,
      "grad_norm": 0.6371743679046631,
      "learning_rate": 0.00028711702661978904,
      "loss": 1.7658,
      "step": 181
    },
    {
      "epoch": 0.09118236472945891,
      "grad_norm": 0.6294912099838257,
      "learning_rate": 0.00028704168759417374,
      "loss": 1.6954,
      "step": 182
    },
    {
      "epoch": 0.09168336673346693,
      "grad_norm": 0.6587108373641968,
      "learning_rate": 0.0002869663485685585,
      "loss": 1.8646,
      "step": 183
    },
    {
      "epoch": 0.09218436873747494,
      "grad_norm": 0.6413068771362305,
      "learning_rate": 0.0002868910095429432,
      "loss": 2.0061,
      "step": 184
    },
    {
      "epoch": 0.09268537074148296,
      "grad_norm": 0.6469525694847107,
      "learning_rate": 0.00028681567051732796,
      "loss": 1.821,
      "step": 185
    },
    {
      "epoch": 0.09318637274549098,
      "grad_norm": 0.6417365074157715,
      "learning_rate": 0.00028674033149171267,
      "loss": 1.8994,
      "step": 186
    },
    {
      "epoch": 0.093687374749499,
      "grad_norm": 0.6416391134262085,
      "learning_rate": 0.00028666499246609743,
      "loss": 1.7628,
      "step": 187
    },
    {
      "epoch": 0.09418837675350701,
      "grad_norm": 0.6492236852645874,
      "learning_rate": 0.00028658965344048213,
      "loss": 1.7946,
      "step": 188
    },
    {
      "epoch": 0.09468937875751503,
      "grad_norm": 0.635639488697052,
      "learning_rate": 0.00028651431441486684,
      "loss": 1.7534,
      "step": 189
    },
    {
      "epoch": 0.09519038076152304,
      "grad_norm": 0.6563454270362854,
      "learning_rate": 0.0002864389753892516,
      "loss": 1.7212,
      "step": 190
    },
    {
      "epoch": 0.09569138276553106,
      "grad_norm": 0.660172700881958,
      "learning_rate": 0.00028636363636363636,
      "loss": 1.7969,
      "step": 191
    },
    {
      "epoch": 0.09619238476953908,
      "grad_norm": 0.6818733215332031,
      "learning_rate": 0.00028628829733802106,
      "loss": 1.8091,
      "step": 192
    },
    {
      "epoch": 0.0966933867735471,
      "grad_norm": 0.6508086323738098,
      "learning_rate": 0.0002862129583124058,
      "loss": 1.8333,
      "step": 193
    },
    {
      "epoch": 0.09719438877755511,
      "grad_norm": 0.6614415049552917,
      "learning_rate": 0.0002861376192867905,
      "loss": 1.9563,
      "step": 194
    },
    {
      "epoch": 0.09769539078156313,
      "grad_norm": 0.6511570811271667,
      "learning_rate": 0.0002860622802611753,
      "loss": 1.9191,
      "step": 195
    },
    {
      "epoch": 0.09819639278557114,
      "grad_norm": 0.6392014026641846,
      "learning_rate": 0.00028598694123556,
      "loss": 1.8118,
      "step": 196
    },
    {
      "epoch": 0.09869739478957916,
      "grad_norm": 0.6255333423614502,
      "learning_rate": 0.0002859116022099447,
      "loss": 1.9144,
      "step": 197
    },
    {
      "epoch": 0.09919839679358718,
      "grad_norm": 0.6069381833076477,
      "learning_rate": 0.00028583626318432945,
      "loss": 1.8036,
      "step": 198
    },
    {
      "epoch": 0.09969939879759519,
      "grad_norm": 0.6432672739028931,
      "learning_rate": 0.0002857609241587142,
      "loss": 1.8292,
      "step": 199
    },
    {
      "epoch": 0.10020040080160321,
      "grad_norm": 0.6631090641021729,
      "learning_rate": 0.0002856855851330989,
      "loss": 1.8854,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 3992,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.89216143179776e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}