|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2604166666666667,
  "eval_steps": 500,
  "global_step": 25,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010416666666666666,
      "grad_norm": 0.46898284554481506,
      "learning_rate": 0.0001,
      "loss": 0.4984,
      "objective/entropy": 1536.0,
      "step": 1,
      "train/nll_loss_a": 0.46235302090644836,
      "train/nll_loss_b": 0.5343712766965231,
      "val/completion_length": 141.19872029622397,
      "val/contain_eos_token": 0.9294871687889099,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.35256410638491315,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.03846153989434242,
      "val/fraction_both_incorrect": 0.628205140431722,
      "val/fraction_correct": 0.20512820780277252,
      "val/fraction_cyrillic_a": 0.09817602237065633,
      "val/fraction_cyrillic_b": 0.07149943461020787,
      "val/fraction_latin_a": 0.48449812332789105,
      "val/fraction_latin_b": 0.5023942093054453,
      "val/fraction_number_a": 0.18907449146111807,
      "val/fraction_number_b": 0.19445918997128805,
      "val/fraction_other_a": 0.22825137277444205,
      "val/fraction_other_b": 0.2316471884648005,
      "val/fraction_ties": 0.6666666666666666,
      "val/lang_prob_bg": 0.0268978967020909,
      "val/lang_prob_en": 0.6749410231908163,
      "val/latin_first_token": 0.6474358836809794,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.020833333333333332,
      "grad_norm": 0.49648183584213257,
      "learning_rate": 0.0001,
      "loss": 0.4431,
      "objective/entropy": 1424.0,
      "step": 2,
      "train/nll_loss_a": 0.4012756248315175,
      "train/nll_loss_b": 0.4849816660086314,
      "val/completion_length": 138.66666666666666,
      "val/contain_eos_token": 0.9102564056714376,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.32692308227221173,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.06410256524880727,
      "val/fraction_both_incorrect": 0.7179487148920695,
      "val/fraction_correct": 0.1730769251783689,
      "val/fraction_cyrillic_a": 0.0820654605825742,
      "val/fraction_cyrillic_b": 0.06807015091180801,
      "val/fraction_latin_a": 0.46912774443626404,
      "val/fraction_latin_b": 0.4876726269721985,
      "val/fraction_number_a": 0.21081160008907318,
      "val/fraction_number_b": 0.2032010406255722,
      "val/fraction_other_a": 0.2379952073097229,
      "val/fraction_other_b": 0.24105618397394815,
      "val/fraction_ties": 0.7820512851079305,
      "val/lang_prob_bg": 0.03282865695655346,
      "val/lang_prob_en": 0.6723186572392782,
      "val/latin_first_token": 0.6666666666666666,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.006410256649057071
    },
    {
      "epoch": 0.03125,
      "grad_norm": 0.6362881660461426,
      "learning_rate": 0.0001,
      "loss": 0.505,
      "objective/entropy": 1469.3333333333333,
      "step": 3,
      "train/nll_loss_a": 0.41949082414309186,
      "train/nll_loss_b": 0.5905094941457113,
      "val/completion_length": 153.07691955566406,
      "val/contain_eos_token": 0.878205140431722,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.3076923092206319,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.03846153989434242,
      "val/fraction_both_incorrect": 0.7820512851079305,
      "val/fraction_correct": 0.12820513298114142,
      "val/fraction_cyrillic_a": 0.09064680337905884,
      "val/fraction_cyrillic_b": 0.07422023018201192,
      "val/fraction_latin_a": 0.4563806454340617,
      "val/fraction_latin_b": 0.4618365466594696,
      "val/fraction_number_a": 0.21234740813573202,
      "val/fraction_number_b": 0.21217785278956094,
      "val/fraction_other_a": 0.24062515298525491,
      "val/fraction_other_b": 0.2517653902371724,
      "val/fraction_ties": 0.8205128312110901,
      "val/lang_prob_bg": 0.03160186484456062,
      "val/lang_prob_en": 0.6696631709734598,
      "val/latin_first_token": 0.6923076709111532,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.041666666666666664,
      "grad_norm": 0.5365626215934753,
      "learning_rate": 0.0001,
      "loss": 0.3568,
      "objective/entropy": 1538.6666666666667,
      "step": 4,
      "train/nll_loss_a": 0.3584041992823283,
      "train/nll_loss_b": 0.3552741805712382,
      "val/completion_length": 139.39102172851562,
      "val/contain_eos_token": 0.9230769077936808,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.21794872482617697,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.08974359060327212,
      "val/fraction_both_incorrect": 0.7692307631174723,
      "val/fraction_correct": 0.1602564131220182,
      "val/fraction_cyrillic_a": 0.06686830148100853,
      "val/fraction_cyrillic_b": 0.040630811204512916,
      "val/fraction_latin_a": 0.48262248436609906,
      "val/fraction_latin_b": 0.4933239420255025,
      "val/fraction_number_a": 0.20719597240289053,
      "val/fraction_number_b": 0.22095757722854614,
      "val/fraction_other_a": 0.2433132529258728,
      "val/fraction_other_b": 0.24508768320083618,
      "val/fraction_ties": 0.8589743574460348,
      "val/lang_prob_bg": 0.02131816806892554,
      "val/lang_prob_en": 0.6912566820780436,
      "val/latin_first_token": 0.7820512851079305,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.052083333333333336,
      "grad_norm": 0.6317083835601807,
      "learning_rate": 0.0001,
      "loss": 0.3542,
      "objective/entropy": 1616.0,
      "step": 5,
      "train/nll_loss_a": 0.3730636735757192,
      "train/nll_loss_b": 0.33537689844767254,
      "val/completion_length": 141.45512898763022,
      "val/contain_eos_token": 0.9230769276618958,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.03846153927346071,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.10256410514314969,
      "val/fraction_both_incorrect": 0.7692307829856873,
      "val/fraction_correct": 0.16666666915019354,
      "val/fraction_cyrillic_a": 0.018022120309372742,
      "val/fraction_cyrillic_b": 0.008442190863812963,
      "val/fraction_latin_a": 0.530072808265686,
      "val/fraction_latin_b": 0.5371540983517965,
      "val/fraction_number_a": 0.20419377585252127,
      "val/fraction_number_b": 0.19861711064974466,
      "val/fraction_other_a": 0.2477113058169683,
      "val/fraction_other_b": 0.25578661759694415,
      "val/fraction_ties": 0.871794859568278,
      "val/lang_prob_bg": 0.005829914240166545,
      "val/lang_prob_en": 0.6994746724764506,
      "val/latin_first_token": 0.9615384538968405,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.0625,
      "grad_norm": 0.6112334728240967,
      "learning_rate": 0.0001,
      "loss": 0.3526,
      "objective/entropy": 1450.6666666666667,
      "step": 6,
      "train/nll_loss_a": 0.3616310755411784,
      "train/nll_loss_b": 0.3435203830401103,
      "val/completion_length": 139.3397420247396,
      "val/contain_eos_token": 0.942307690779368,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.012820513298114141,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.07692307978868484,
      "val/fraction_both_incorrect": 0.7564102411270142,
      "val/fraction_correct": 0.1602564131220182,
      "val/fraction_cyrillic_a": 0.005665303532926676,
      "val/fraction_cyrillic_b": 0.0012210012258340914,
      "val/fraction_latin_a": 0.5337207714716593,
      "val/fraction_latin_b": 0.5389339327812195,
      "val/fraction_number_a": 0.20246068636576334,
      "val/fraction_number_b": 0.20605232814947763,
      "val/fraction_other_a": 0.2581532299518585,
      "val/fraction_other_b": 0.25379273295402527,
      "val/fraction_ties": 0.8333333134651184,
      "val/lang_prob_bg": 0.0024220591488604746,
      "val/lang_prob_en": 0.716150164604187,
      "val/latin_first_token": 0.9871794780095419,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.07291666666666667,
      "grad_norm": 0.5834570527076721,
      "learning_rate": 0.0001,
      "loss": 0.3122,
      "objective/entropy": 1482.6666666666667,
      "step": 7,
      "train/nll_loss_a": 0.31628555059432983,
      "train/nll_loss_b": 0.3082062304019928,
      "val/completion_length": 140.28205362955728,
      "val/contain_eos_token": 0.935897429784139,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.012820513298114141,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.051282053192456566,
      "val/fraction_both_incorrect": 0.7948717872301737,
      "val/fraction_correct": 0.12820513298114142,
      "val/fraction_cyrillic_a": 0.004876403972351302,
      "val/fraction_cyrillic_b": 0.0055291604561110335,
      "val/fraction_latin_a": 0.51544189453125,
      "val/fraction_latin_b": 0.5175811052322388,
      "val/fraction_number_a": 0.21839049458503723,
      "val/fraction_number_b": 0.21114349365234375,
      "val/fraction_other_a": 0.2612912356853485,
      "val/fraction_other_b": 0.26574622591336566,
      "val/fraction_ties": 0.8461538553237915,
      "val/lang_prob_bg": 0.0024805181116486588,
      "val/lang_prob_en": 0.7189218997955322,
      "val/latin_first_token": 0.9871794780095419,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 0.5234330296516418,
      "learning_rate": 0.0001,
      "loss": 0.3689,
      "objective/entropy": 1560.0,
      "step": 8,
      "train/nll_loss_a": 0.38403966029485065,
      "train/nll_loss_b": 0.353829691807429,
      "val/completion_length": 147.18589782714844,
      "val/contain_eos_token": 0.8910256226857504,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.032051283245285354,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.12820513173937798,
      "val/fraction_both_incorrect": 0.6794871886571249,
      "val/fraction_correct": 0.22435897588729858,
      "val/fraction_cyrillic_a": 0.005463951422522466,
      "val/fraction_cyrillic_b": 0.00364190728093187,
      "val/fraction_latin_a": 0.519624650478363,
      "val/fraction_latin_b": 0.5310182571411133,
      "val/fraction_number_a": 0.22126641869544983,
      "val/fraction_number_b": 0.20790701607863107,
      "val/fraction_other_a": 0.2536449631055196,
      "val/fraction_other_b": 0.25743279854456586,
      "val/fraction_ties": 0.807692309220632,
      "val/lang_prob_bg": 0.0022993393164748945,
      "val/lang_prob_en": 0.7228630383809408,
      "val/latin_first_token": 0.9679486950238546,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.09375,
      "grad_norm": 0.46253740787506104,
      "learning_rate": 0.0001,
      "loss": 0.3548,
      "objective/entropy": 1626.6666666666667,
      "step": 9,
      "train/nll_loss_a": 0.353506733973821,
      "train/nll_loss_b": 0.3560173710187276,
      "val/completion_length": 141.10897318522134,
      "val/contain_eos_token": 0.8974359035491943,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.03846153927346071,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.17948718120654425,
      "val/fraction_both_incorrect": 0.6666666865348816,
      "val/fraction_correct": 0.25641026099522907,
      "val/fraction_cyrillic_a": 0.009551782781879107,
      "val/fraction_cyrillic_b": 0.007778597995638847,
      "val/fraction_latin_a": 0.5005057454109192,
      "val/fraction_latin_b": 0.5112853447596232,
      "val/fraction_number_a": 0.2224580099185308,
      "val/fraction_number_b": 0.2299031764268875,
      "val/fraction_other_a": 0.26748446623484295,
      "val/fraction_other_b": 0.25103287398815155,
      "val/fraction_ties": 0.8461538354555765,
      "val/lang_prob_bg": 0.003838104816774527,
      "val/lang_prob_en": 0.7203066547711691,
      "val/latin_first_token": 0.9615384538968405,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 0.6793639063835144,
      "learning_rate": 0.0001,
      "loss": 0.4072,
      "objective/entropy": 1544.0,
      "step": 10,
      "train/nll_loss_a": 0.40951302647590637,
      "train/nll_loss_b": 0.4048899710178375,
      "val/completion_length": 139.59615580240884,
      "val/contain_eos_token": 0.9230769276618958,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.044871795922517776,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.10256410514314969,
      "val/fraction_both_incorrect": 0.7564102609952291,
      "val/fraction_correct": 0.1730769251783689,
      "val/fraction_cyrillic_a": 0.01814807563399275,
      "val/fraction_cyrillic_b": 0.01091132735988746,
      "val/fraction_latin_a": 0.479096124569575,
      "val/fraction_latin_b": 0.48450469970703125,
      "val/fraction_number_a": 0.24191749095916748,
      "val/fraction_number_b": 0.23329021533330283,
      "val/fraction_other_a": 0.2608383099238078,
      "val/fraction_other_b": 0.2712937593460083,
      "val/fraction_ties": 0.8589743375778198,
      "val/lang_prob_bg": 0.005877171643078327,
      "val/lang_prob_en": 0.6905626058578491,
      "val/latin_first_token": 0.9551281929016113,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.11458333333333333,
      "grad_norm": 0.6676069498062134,
      "learning_rate": 0.0001,
      "loss": 0.3261,
      "objective/entropy": 1296.0,
      "step": 11,
      "train/nll_loss_a": 0.3414422770341237,
      "train/nll_loss_b": 0.31081566711266834,
      "val/completion_length": 133.35897318522134,
      "val/contain_eos_token": 0.9615384538968405,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.012820513298114141,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.07692307854692142,
      "val/fraction_both_incorrect": 0.7692307631174723,
      "val/fraction_correct": 0.15384616081913313,
      "val/fraction_cyrillic_a": 0.009972451565166315,
      "val/fraction_cyrillic_b": 0.008257918680707613,
      "val/fraction_latin_a": 0.4708147446314494,
      "val/fraction_latin_b": 0.4841614067554474,
      "val/fraction_number_a": 0.2511301040649414,
      "val/fraction_number_b": 0.23997685313224792,
      "val/fraction_other_a": 0.26808270812034607,
      "val/fraction_other_b": 0.2676038245360057,
      "val/fraction_ties": 0.8461538354555765,
      "val/lang_prob_bg": 0.004965859581716359,
      "val/lang_prob_en": 0.7087472081184387,
      "val/latin_first_token": 0.9871794780095419,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.48878854513168335,
      "learning_rate": 0.0001,
      "loss": 0.3202,
      "objective/entropy": 1520.0,
      "step": 12,
      "train/nll_loss_a": 0.3013697862625122,
      "train/nll_loss_b": 0.33911073207855225,
      "val/completion_length": 125.4551289876302,
      "val/contain_eos_token": 0.9551282127698263,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.01923076994717121,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.07692307854692142,
      "val/fraction_both_incorrect": 0.6410256425539652,
      "val/fraction_correct": 0.21794872482617697,
      "val/fraction_cyrillic_a": 0.0016106078983284533,
      "val/fraction_cyrillic_b": 0.00164324635018905,
      "val/fraction_latin_a": 0.4850431780020396,
      "val/fraction_latin_b": 0.49556367595990497,
      "val/fraction_number_a": 0.24694832662741342,
      "val/fraction_number_b": 0.24452554682890573,
      "val/fraction_other_a": 0.26639790336290997,
      "val/fraction_other_b": 0.25826754172643024,
      "val/fraction_ties": 0.7179487347602844,
      "val/lang_prob_bg": 0.0020299581810832024,
      "val/lang_prob_en": 0.7036298712094625,
      "val/latin_first_token": 0.9807692170143127,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.13541666666666666,
      "grad_norm": 0.47080346941947937,
      "learning_rate": 0.0001,
      "loss": 0.3551,
      "objective/entropy": 1610.6666666666667,
      "step": 13,
      "train/nll_loss_a": 0.3537709911664327,
      "train/nll_loss_b": 0.35645443201065063,
      "val/completion_length": 132.8397471110026,
      "val/contain_eos_token": 0.942307690779368,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.01923076994717121,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.11538461844126384,
      "val/fraction_both_incorrect": 0.6410256425539652,
      "val/fraction_correct": 0.23717949291070303,
      "val/fraction_cyrillic_a": 0.0035886759869754314,
      "val/fraction_cyrillic_b": 0.002304682352890571,
      "val/fraction_latin_a": 0.5025050441424052,
      "val/fraction_latin_b": 0.49992923935254413,
      "val/fraction_number_a": 0.22844381630420685,
      "val/fraction_number_b": 0.23804503679275513,
      "val/fraction_other_a": 0.2654624879360199,
      "val/fraction_other_b": 0.2597210705280304,
      "val/fraction_ties": 0.7564102609952291,
      "val/lang_prob_bg": 0.0021097887850676975,
      "val/lang_prob_en": 0.7135748863220215,
      "val/latin_first_token": 0.9807692170143127,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.14583333333333334,
      "grad_norm": 0.5261266827583313,
      "learning_rate": 0.0001,
      "loss": 0.3553,
      "objective/entropy": 1768.0,
      "step": 14,
      "train/nll_loss_a": 0.36536062757174176,
      "train/nll_loss_b": 0.34532251954078674,
      "val/completion_length": 130.3205134073893,
      "val/contain_eos_token": 0.942307690779368,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.006410256649057071,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.14102564255396524,
      "val/fraction_both_incorrect": 0.7435897588729858,
      "val/fraction_correct": 0.19871795177459717,
      "val/fraction_cyrillic_a": 0.0008547008813669285,
      "val/fraction_cyrillic_b": 0.0002670940205765267,
      "val/fraction_latin_a": 0.48295870423316956,
      "val/fraction_latin_b": 0.4754104216893514,
      "val/fraction_number_a": 0.24797451992829642,
      "val/fraction_number_b": 0.25482123096783954,
      "val/fraction_other_a": 0.26821208000183105,
      "val/fraction_other_b": 0.269501268863678,
      "val/fraction_ties": 0.8846153616905212,
      "val/lang_prob_bg": 0.0016354583591843646,
      "val/lang_prob_en": 0.7125194072723389,
      "val/latin_first_token": 0.9935897390047709,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.15625,
      "grad_norm": 0.5608013868331909,
      "learning_rate": 0.0001,
      "loss": 0.375,
      "objective/entropy": 1813.3333333333333,
      "step": 15,
      "train/nll_loss_a": 0.3743097384770711,
      "train/nll_loss_b": 0.3756645123163859,
      "val/completion_length": 113.73076883951823,
      "val/contain_eos_token": 0.942307690779368,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.10256410514314969,
      "val/fraction_both_incorrect": 0.6666666666666666,
      "val/fraction_correct": 0.21794872482617697,
      "val/fraction_cyrillic_a": 0.0005128205132981142,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.4844670593738556,
      "val/fraction_latin_b": 0.48018120725949603,
      "val/fraction_number_a": 0.24094298481941223,
      "val/fraction_number_b": 0.2496140201886495,
      "val/fraction_other_a": 0.2740771571795146,
      "val/fraction_other_b": 0.2702048122882843,
      "val/fraction_ties": 0.7692307829856873,
      "val/lang_prob_bg": 0.0012819842668250203,
      "val/lang_prob_en": 0.712844451268514,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.5459577441215515,
      "learning_rate": 0.0001,
      "loss": 0.3467,
      "objective/entropy": 1781.3333333333333,
      "step": 16,
      "train/nll_loss_a": 0.35024779041608173,
      "train/nll_loss_b": 0.3431568145751953,
      "val/completion_length": 125.86538696289062,
      "val/contain_eos_token": 0.9551281929016113,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.051282053192456566,
      "val/fraction_both_incorrect": 0.7179487148920695,
      "val/fraction_correct": 0.1666666716337204,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.4834948579470317,
      "val/fraction_latin_b": 0.48427216211954754,
      "val/fraction_number_a": 0.24997142453988394,
      "val/fraction_number_b": 0.24836017191410065,
      "val/fraction_other_a": 0.2665337175130844,
      "val/fraction_other_b": 0.26736770073572796,
      "val/fraction_ties": 0.7692307631174723,
      "val/lang_prob_bg": 0.0015082452834273379,
      "val/lang_prob_en": 0.6896043419837952,
      "val/latin_first_token": 0.9935897390047709,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.006410256649057071
    },
    {
      "epoch": 0.17708333333333334,
      "grad_norm": 0.6729714870452881,
      "learning_rate": 0.0001,
      "loss": 0.3411,
      "objective/entropy": 1653.3333333333333,
      "step": 17,
      "train/nll_loss_a": 0.3364799916744232,
      "train/nll_loss_b": 0.34577877322832745,
      "val/completion_length": 112.4551264444987,
      "val/contain_eos_token": 0.9807692368825277,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.012820513298114141,
      "val/fraction_both_incorrect": 0.7564102411270142,
      "val/fraction_correct": 0.12820513049761453,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.45367395877838135,
      "val/fraction_latin_b": 0.46362873911857605,
      "val/fraction_number_a": 0.2715826133886973,
      "val/fraction_number_b": 0.2677338620026906,
      "val/fraction_other_a": 0.2747434576352437,
      "val/fraction_other_b": 0.2686373790105184,
      "val/fraction_ties": 0.7692307631174723,
      "val/lang_prob_bg": 0.0013079955242574215,
      "val/lang_prob_en": 0.7055089473724365,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.1875,
      "grad_norm": 0.5278697609901428,
      "learning_rate": 0.0001,
      "loss": 0.3637,
      "objective/entropy": 1666.6666666666667,
      "step": 18,
      "train/nll_loss_a": 0.3630356788635254,
      "train/nll_loss_b": 0.36438990632692975,
      "val/completion_length": 106.38461303710938,
      "val/contain_eos_token": 0.9743589758872986,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.006410256649057071,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.08974359060327212,
      "val/fraction_both_incorrect": 0.7179487148920695,
      "val/fraction_correct": 0.18589743971824646,
      "val/fraction_cyrillic_a": 0.00018853696140771112,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.46353839834531146,
      "val/fraction_latin_b": 0.4717874725659688,
      "val/fraction_number_a": 0.26522815227508545,
      "val/fraction_number_b": 0.27208030720551807,
      "val/fraction_other_a": 0.27104492982228595,
      "val/fraction_other_b": 0.25613221526145935,
      "val/fraction_ties": 0.807692309220632,
      "val/lang_prob_bg": 0.0014787697000429034,
      "val/lang_prob_en": 0.7207486033439636,
      "val/latin_first_token": 0.9935897390047709,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.19791666666666666,
      "grad_norm": 0.623100996017456,
      "learning_rate": 0.0001,
      "loss": 0.339,
      "objective/entropy": 1757.3333333333333,
      "step": 19,
      "train/nll_loss_a": 0.3500674267609914,
      "train/nll_loss_b": 0.32788631319999695,
      "val/completion_length": 115.87820434570312,
      "val/contain_eos_token": 0.9615384538968405,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.07692307854692142,
      "val/fraction_both_incorrect": 0.7948717872301737,
      "val/fraction_correct": 0.14102564503749213,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.47628764311472577,
      "val/fraction_latin_b": 0.47780614097913104,
      "val/fraction_number_a": 0.25701290369033813,
      "val/fraction_number_b": 0.2525850087404251,
      "val/fraction_other_a": 0.2666994432608287,
      "val/fraction_other_b": 0.2696088453133901,
      "val/fraction_ties": 0.871794859568278,
      "val/lang_prob_bg": 0.0013311682268977165,
      "val/lang_prob_en": 0.7212471763292948,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 0.7427172064781189,
      "learning_rate": 0.0001,
      "loss": 0.3136,
      "objective/entropy": 1717.3333333333333,
      "step": 20,
      "train/nll_loss_a": 0.3320723871390025,
      "train/nll_loss_b": 0.2952205240726471,
      "val/completion_length": 103.90384674072266,
      "val/contain_eos_token": 0.9871794780095419,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.06410256649057071,
      "val/fraction_both_incorrect": 0.7820512851079305,
      "val/fraction_correct": 0.14102564255396524,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.47085316975911456,
      "val/fraction_latin_b": 0.46781421701113385,
      "val/fraction_number_a": 0.2578504929939906,
      "val/fraction_number_b": 0.2744967540105184,
      "val/fraction_other_a": 0.27129634221394855,
      "val/fraction_other_b": 0.2576890190442403,
      "val/fraction_ties": 0.8461538354555765,
      "val/lang_prob_bg": 0.0014502551639452577,
      "val/lang_prob_en": 0.7153881192207336,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.21875,
      "grad_norm": 0.4812980592250824,
      "learning_rate": 0.0001,
      "loss": 0.2812,
      "objective/entropy": 1712.0,
      "step": 21,
      "train/nll_loss_a": 0.2877577245235443,
      "train/nll_loss_b": 0.27465402086575824,
      "val/completion_length": 100.94871775309245,
      "val/contain_eos_token": 0.9871794780095419,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.08974359308679898,
      "val/fraction_both_incorrect": 0.6794871886571249,
      "val/fraction_correct": 0.20512820780277252,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.449023316303889,
      "val/fraction_latin_b": 0.43931347131729126,
      "val/fraction_number_a": 0.2805411020914714,
      "val/fraction_number_b": 0.2886248826980591,
      "val/fraction_other_a": 0.2704355716705322,
      "val/fraction_other_b": 0.27206166585286456,
      "val/fraction_ties": 0.7692307631174723,
      "val/lang_prob_bg": 0.0012512764272590478,
      "val/lang_prob_en": 0.7036919593811035,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.22916666666666666,
      "grad_norm": 0.5802024006843567,
      "learning_rate": 0.0001,
      "loss": 0.2079,
      "objective/entropy": 1754.6666666666667,
      "step": 22,
      "train/nll_loss_a": 0.2198613981405894,
      "train/nll_loss_b": 0.19591793914635977,
      "val/completion_length": 105.2051289876302,
      "val/contain_eos_token": 0.9615384538968405,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.08974359060327212,
      "val/fraction_both_incorrect": 0.8589743375778198,
      "val/fraction_correct": 0.11538461844126384,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.4607094426949819,
      "val/fraction_latin_b": 0.4435524543126424,
      "val/fraction_number_a": 0.2806512117385864,
      "val/fraction_number_b": 0.2736863394578298,
      "val/fraction_other_a": 0.2586393306652705,
      "val/fraction_other_b": 0.28276123603185016,
      "val/fraction_ties": 0.9487179517745972,
      "val/lang_prob_bg": 0.0013595524554451306,
      "val/lang_prob_en": 0.6991243163744608,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.23958333333333334,
      "grad_norm": 0.9169826507568359,
      "learning_rate": 0.0001,
      "loss": 0.3761,
      "objective/entropy": 1493.3333333333333,
      "step": 23,
      "train/nll_loss_a": 0.36857877175013226,
      "train/nll_loss_b": 0.38360129793485004,
      "val/completion_length": 92.24359130859375,
      "val/contain_eos_token": 0.9935897390047709,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.025641026596228283,
      "val/fraction_both_incorrect": 0.7948717872301737,
      "val/fraction_correct": 0.11538461844126384,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.4382043182849884,
      "val/fraction_latin_b": 0.42687369386355084,
      "val/fraction_number_a": 0.2829488515853882,
      "val/fraction_number_b": 0.296395738919576,
      "val/fraction_other_a": 0.2788468599319458,
      "val/fraction_other_b": 0.2767305870850881,
      "val/fraction_ties": 0.8205128312110901,
      "val/lang_prob_bg": 0.0014082260507469375,
      "val/lang_prob_en": 0.7083008488019308,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6027104258537292,
      "learning_rate": 0.0001,
      "loss": 0.2801,
      "objective/entropy": 1490.6666666666667,
      "step": 24,
      "train/nll_loss_a": 0.278068482875824,
      "train/nll_loss_b": 0.28207358221213025,
      "val/completion_length": 86.58333333333333,
      "val/contain_eos_token": 0.9935897390047709,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.08974359060327212,
      "val/fraction_both_incorrect": 0.6794871886571249,
      "val/fraction_correct": 0.20512820780277252,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.44356054067611694,
      "val/fraction_latin_b": 0.44240028659502667,
      "val/fraction_number_a": 0.2843793531258901,
      "val/fraction_number_b": 0.2815621296564738,
      "val/fraction_other_a": 0.272060106197993,
      "val/fraction_other_b": 0.2760376036167145,
      "val/fraction_ties": 0.7692307631174723,
      "val/lang_prob_bg": 0.0013788756914436817,
      "val/lang_prob_en": 0.707394023736318,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    },
    {
      "epoch": 0.2604166666666667,
      "grad_norm": 0.6588619351387024,
      "learning_rate": 0.0001,
      "loss": 0.2373,
      "objective/entropy": 1805.3333333333333,
      "step": 25,
      "train/nll_loss_a": 0.2221569369236628,
      "train/nll_loss_b": 0.25250792503356934,
      "val/completion_length": 97.98077138264973,
      "val/contain_eos_token": 0.9871794780095419,
      "val/contains_guillemets": 0.0,
      "val/cyrillic_first_token": 0.0,
      "val/empty_batch": 0.0,
      "val/fraction_both_correct": 0.07692307854692142,
      "val/fraction_both_incorrect": 0.8461538354555765,
      "val/fraction_correct": 0.11538461844126384,
      "val/fraction_cyrillic_a": 0.0,
      "val/fraction_cyrillic_b": 0.0,
      "val/fraction_latin_a": 0.4407140811284383,
      "val/fraction_latin_b": 0.449174165725708,
      "val/fraction_number_a": 0.2688818077246348,
      "val/fraction_number_b": 0.264347364505132,
      "val/fraction_other_a": 0.2904041012128194,
      "val/fraction_other_b": 0.2864784598350525,
      "val/fraction_ties": 0.9230769077936808,
      "val/lang_prob_bg": 0.0015077214144791167,
      "val/lang_prob_en": 0.7170586188634237,
      "val/latin_first_token": 1.0,
      "val/number_first_token": 0.0,
      "val/other_first_token": 0.0
    }
  ],
  "logging_steps": 1,
  "max_steps": 96,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 26,
  "trial_name": null,
  "trial_params": null
}