|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9963379544860058, |
|
"eval_steps": 500, |
|
"global_step": 954, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.020925974365681402, |
|
"grad_norm": 0.5565130669449413, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.0288, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.041851948731362804, |
|
"grad_norm": 0.18606951178093903, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.0118, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0627779230970442, |
|
"grad_norm": 0.03950245765434584, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.0039, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08370389746272561, |
|
"grad_norm": 0.03819057800914349, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.0029, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10462987182840701, |
|
"grad_norm": 0.0544544775809417, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 0.0028, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1255558461940884, |
|
"grad_norm": 0.017750069891728056, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0024, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14648182055976983, |
|
"grad_norm": 0.025688827192495198, |
|
"learning_rate": 1.4583333333333333e-05, |
|
"loss": 0.0021, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16740779492545121, |
|
"grad_norm": 0.009892162090487124, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0013, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18833376929113263, |
|
"grad_norm": 0.013992833360284824, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 0.0006, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.20925974365681402, |
|
"grad_norm": 0.009570194364057852, |
|
"learning_rate": 1.9998927475076107e-05, |
|
"loss": 0.0029, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23018571802249543, |
|
"grad_norm": 0.011885486768552911, |
|
"learning_rate": 1.998686421164407e-05, |
|
"loss": 0.0019, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2511116923881768, |
|
"grad_norm": 0.00535124159298834, |
|
"learning_rate": 1.9961413253717214e-05, |
|
"loss": 0.0007, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2720376667538582, |
|
"grad_norm": 0.014087761849564343, |
|
"learning_rate": 1.9922608719076874e-05, |
|
"loss": 0.002, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.29296364111953965, |
|
"grad_norm": 0.014243222134916782, |
|
"learning_rate": 1.9870502626379127e-05, |
|
"loss": 0.0011, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.31388961548522104, |
|
"grad_norm": 0.02081790679071402, |
|
"learning_rate": 1.980516482542224e-05, |
|
"loss": 0.0019, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.33481558985090243, |
|
"grad_norm": 0.017403596614001808, |
|
"learning_rate": 1.972668290351084e-05, |
|
"loss": 0.0022, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3557415642165838, |
|
"grad_norm": 0.014181950638291016, |
|
"learning_rate": 1.9635162068042547e-05, |
|
"loss": 0.0013, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.37666753858226526, |
|
"grad_norm": 0.004266630945852344, |
|
"learning_rate": 1.9530725005474195e-05, |
|
"loss": 0.0016, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.39759351294794665, |
|
"grad_norm": 0.003584919486125673, |
|
"learning_rate": 1.9413511716856973e-05, |
|
"loss": 0.0017, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.41851948731362804, |
|
"grad_norm": 0.010407187744097775, |
|
"learning_rate": 1.9283679330160726e-05, |
|
"loss": 0.0006, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4394454616793094, |
|
"grad_norm": 0.011994286058888401, |
|
"learning_rate": 1.9141401889639167e-05, |
|
"loss": 0.004, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.46037143604499087, |
|
"grad_norm": 0.002559284420598525, |
|
"learning_rate": 1.898687012251826e-05, |
|
"loss": 0.0014, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.48129741041067226, |
|
"grad_norm": 0.01635207542112704, |
|
"learning_rate": 1.8820291183320602e-05, |
|
"loss": 0.0035, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5022233847763536, |
|
"grad_norm": 0.019845090979002694, |
|
"learning_rate": 1.8641888376168483e-05, |
|
"loss": 0.0004, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5231493591420351, |
|
"grad_norm": 0.037240037127226344, |
|
"learning_rate": 1.845190085543795e-05, |
|
"loss": 0.0032, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5440753335077164, |
|
"grad_norm": 0.006653507161649646, |
|
"learning_rate": 1.8250583305165098e-05, |
|
"loss": 0.0009, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5650013078733979, |
|
"grad_norm": 0.6020150741013875, |
|
"learning_rate": 1.8038205597634392e-05, |
|
"loss": 0.0011, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5859272822390793, |
|
"grad_norm": 0.05961677548525192, |
|
"learning_rate": 1.7815052431606702e-05, |
|
"loss": 0.0042, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6068532566047606, |
|
"grad_norm": 0.03667937697835753, |
|
"learning_rate": 1.7581422950671942e-05, |
|
"loss": 0.0029, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6277792309704421, |
|
"grad_norm": 0.02203193747208106, |
|
"learning_rate": 1.733763034223804e-05, |
|
"loss": 0.0017, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6487052053361234, |
|
"grad_norm": 0.03047813653712161, |
|
"learning_rate": 1.7084001417693702e-05, |
|
"loss": 0.0013, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6696311797018049, |
|
"grad_norm": 0.02403172913701344, |
|
"learning_rate": 1.682087617430782e-05, |
|
"loss": 0.001, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6905571540674863, |
|
"grad_norm": 0.020885112433327038, |
|
"learning_rate": 1.6548607339452853e-05, |
|
"loss": 0.0009, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7114831284331676, |
|
"grad_norm": 0.02143325180735118, |
|
"learning_rate": 1.626755989776303e-05, |
|
"loss": 0.0007, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7324091027988491, |
|
"grad_norm": 0.028502523632724423, |
|
"learning_rate": 1.5978110601861408e-05, |
|
"loss": 0.0016, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7533350771645305, |
|
"grad_norm": 0.00832352415812105, |
|
"learning_rate": 1.568064746731156e-05, |
|
"loss": 0.0009, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7742610515302119, |
|
"grad_norm": 0.009527918206519135, |
|
"learning_rate": 1.5375569252470897e-05, |
|
"loss": 0.0018, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7951870258958933, |
|
"grad_norm": 0.025970668054977514, |
|
"learning_rate": 1.506328492394303e-05, |
|
"loss": 0.0006, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8161130002615746, |
|
"grad_norm": 0.026371283237352123, |
|
"learning_rate": 1.4744213108345605e-05, |
|
"loss": 0.0023, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8370389746272561, |
|
"grad_norm": 0.030148449665476192, |
|
"learning_rate": 1.4418781531128636e-05, |
|
"loss": 0.0021, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8579649489929375, |
|
"grad_norm": 0.009123367946997817, |
|
"learning_rate": 1.4087426443195549e-05, |
|
"loss": 0.0009, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8788909233586188, |
|
"grad_norm": 0.033901495227343266, |
|
"learning_rate": 1.375059203609562e-05, |
|
"loss": 0.0023, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8998168977243003, |
|
"grad_norm": 0.007725818014371622, |
|
"learning_rate": 1.3408729846571716e-05, |
|
"loss": 0.0006, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9207428720899817, |
|
"grad_norm": 0.01076304174801131, |
|
"learning_rate": 1.3062298151261592e-05, |
|
"loss": 0.0009, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9416688464556631, |
|
"grad_norm": 0.08792335870716929, |
|
"learning_rate": 1.2711761352364172e-05, |
|
"loss": 0.0016, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9625948208213445, |
|
"grad_norm": 0.019193508970073957, |
|
"learning_rate": 1.2357589355094275e-05, |
|
"loss": 0.0005, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9835207951870258, |
|
"grad_norm": 0.008676099200769616, |
|
"learning_rate": 1.2000256937760446e-05, |
|
"loss": 0.0019, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.0044467695527073, |
|
"grad_norm": 0.0014044382775048332, |
|
"learning_rate": 1.1640243115310219e-05, |
|
"loss": 0.0013, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0253727439183886, |
|
"grad_norm": 0.0047799061066043395, |
|
"learning_rate": 1.127803049719605e-05, |
|
"loss": 0.0008, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0462987182840702, |
|
"grad_norm": 0.006651240484213586, |
|
"learning_rate": 1.091410464042268e-05, |
|
"loss": 0.0004, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0672246926497515, |
|
"grad_norm": 0.007370492651937019, |
|
"learning_rate": 1.0548953398643276e-05, |
|
"loss": 0.0002, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0881506670154328, |
|
"grad_norm": 0.004999091099852188, |
|
"learning_rate": 1.0183066268176775e-05, |
|
"loss": 0.0013, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1090766413811144, |
|
"grad_norm": 0.0067648135423557494, |
|
"learning_rate": 9.81693373182323e-06, |
|
"loss": 0.0004, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1300026157467957, |
|
"grad_norm": 0.006355010008632378, |
|
"learning_rate": 9.451046601356725e-06, |
|
"loss": 0.0005, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.150928590112477, |
|
"grad_norm": 0.0019993701626922662, |
|
"learning_rate": 9.085895359577324e-06, |
|
"loss": 0.0012, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1718545644781586, |
|
"grad_norm": 0.07841604107285226, |
|
"learning_rate": 8.721969502803954e-06, |
|
"loss": 0.0007, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.19278053884384, |
|
"grad_norm": 0.008878084655677647, |
|
"learning_rate": 8.359756884689785e-06, |
|
"loss": 0.0006, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.2137065132095213, |
|
"grad_norm": 0.022433859996456522, |
|
"learning_rate": 7.999743062239557e-06, |
|
"loss": 0.001, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.2346324875752028, |
|
"grad_norm": 0.007321094493119974, |
|
"learning_rate": 7.642410644905726e-06, |
|
"loss": 0.0002, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.2555584619408842, |
|
"grad_norm": 0.004130666592411099, |
|
"learning_rate": 7.2882386476358304e-06, |
|
"loss": 0.0008, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2764844363065655, |
|
"grad_norm": 0.009136384794392362, |
|
"learning_rate": 6.937701848738407e-06, |
|
"loss": 0.0005, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.297410410672247, |
|
"grad_norm": 0.014936959834984399, |
|
"learning_rate": 6.591270153428288e-06, |
|
"loss": 0.0015, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.3183363850379284, |
|
"grad_norm": 0.00995968283183629, |
|
"learning_rate": 6.249407963904381e-06, |
|
"loss": 0.0003, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.3392623594036097, |
|
"grad_norm": 0.007922264005744792, |
|
"learning_rate": 5.912573556804453e-06, |
|
"loss": 0.002, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.3601883337692913, |
|
"grad_norm": 0.009732474218704226, |
|
"learning_rate": 5.581218468871365e-06, |
|
"loss": 0.0015, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3811143081349726, |
|
"grad_norm": 0.029322482465715245, |
|
"learning_rate": 5.2557868916543996e-06, |
|
"loss": 0.0004, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.402040282500654, |
|
"grad_norm": 0.002219147933917833, |
|
"learning_rate": 4.9367150760569746e-06, |
|
"loss": 0.0013, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4229662568663353, |
|
"grad_norm": 0.016601760824032873, |
|
"learning_rate": 4.6244307475291025e-06, |
|
"loss": 0.0007, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.4438922312320168, |
|
"grad_norm": 0.03618373662395389, |
|
"learning_rate": 4.319352532688444e-06, |
|
"loss": 0.0016, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.4648182055976982, |
|
"grad_norm": 0.008705175714719602, |
|
"learning_rate": 4.0218893981385935e-06, |
|
"loss": 0.0009, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4857441799633795, |
|
"grad_norm": 0.0029313903115019873, |
|
"learning_rate": 3.732440102236975e-06, |
|
"loss": 0.0002, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.5066701543290608, |
|
"grad_norm": 0.020710402409599028, |
|
"learning_rate": 3.4513926605471504e-06, |
|
"loss": 0.0007, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.5275961286947424, |
|
"grad_norm": 0.03507089834475249, |
|
"learning_rate": 3.1791238256921785e-06, |
|
"loss": 0.0021, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.5485221030604237, |
|
"grad_norm": 0.07040984119700786, |
|
"learning_rate": 2.9159985823062997e-06, |
|
"loss": 0.0012, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.569448077426105, |
|
"grad_norm": 0.0046045007369338465, |
|
"learning_rate": 2.662369657761963e-06, |
|
"loss": 0.0005, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5903740517917866, |
|
"grad_norm": 0.044927829447691095, |
|
"learning_rate": 2.418577049328058e-06, |
|
"loss": 0.001, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.611300026157468, |
|
"grad_norm": 0.005367586480852439, |
|
"learning_rate": 2.1849475683932996e-06, |
|
"loss": 0.0005, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.6322260005231493, |
|
"grad_norm": 0.014755038810438978, |
|
"learning_rate": 1.961794402365611e-06, |
|
"loss": 0.0012, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.6531519748888308, |
|
"grad_norm": 0.007586271120637286, |
|
"learning_rate": 1.7494166948349057e-06, |
|
"loss": 0.0023, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.6740779492545121, |
|
"grad_norm": 0.01450560934843513, |
|
"learning_rate": 1.5480991445620541e-06, |
|
"loss": 0.001, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6950039236201935, |
|
"grad_norm": 0.007197310242295822, |
|
"learning_rate": 1.3581116238315194e-06, |
|
"loss": 0.0004, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.715929897985875, |
|
"grad_norm": 0.011429665649141593, |
|
"learning_rate": 1.1797088166794002e-06, |
|
"loss": 0.0009, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.7368558723515564, |
|
"grad_norm": 0.005479589436376736, |
|
"learning_rate": 1.013129877481741e-06, |
|
"loss": 0.0015, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.7577818467172377, |
|
"grad_norm": 0.010284806366489312, |
|
"learning_rate": 8.585981103608343e-07, |
|
"loss": 0.0003, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.7787078210829192, |
|
"grad_norm": 0.004805391506128972, |
|
"learning_rate": 7.163206698392744e-07, |
|
"loss": 0.0007, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7996337954486006, |
|
"grad_norm": 0.00832006380541767, |
|
"learning_rate": 5.864882831430274e-07, |
|
"loss": 0.0013, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.820559769814282, |
|
"grad_norm": 0.0016488023765299498, |
|
"learning_rate": 4.6927499452580574e-07, |
|
"loss": 0.0004, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.8414857441799635, |
|
"grad_norm": 0.010828047107950923, |
|
"learning_rate": 3.6483793195745686e-07, |
|
"loss": 0.002, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.8624117185456448, |
|
"grad_norm": 0.00029272379323776047, |
|
"learning_rate": 2.733170964891607e-07, |
|
"loss": 0.0008, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.8833376929113261, |
|
"grad_norm": 0.011245028659871065, |
|
"learning_rate": 1.9483517457776436e-07, |
|
"loss": 0.0013, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9042636672770077, |
|
"grad_norm": 0.005356139523384608, |
|
"learning_rate": 1.2949737362087156e-07, |
|
"loss": 0.0009, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.925189641642689, |
|
"grad_norm": 0.006490951580098314, |
|
"learning_rate": 7.73912809231292e-08, |
|
"loss": 0.0005, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.9461156160083704, |
|
"grad_norm": 0.003383501070683301, |
|
"learning_rate": 3.858674628278825e-08, |
|
"loss": 0.003, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.967041590374052, |
|
"grad_norm": 0.012526402652665877, |
|
"learning_rate": 1.3135788355934652e-08, |
|
"loss": 0.0004, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.9879675647397332, |
|
"grad_norm": 0.016656680718419487, |
|
"learning_rate": 1.0725249238940916e-09, |
|
"loss": 0.0006, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.9963379544860058, |
|
"step": 954, |
|
"total_flos": 841298254757888.0, |
|
"train_loss": 0.001730952451793395, |
|
"train_runtime": 49706.0079, |
|
"train_samples_per_second": 2.461, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 954, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 841298254757888.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|