cervisiarius
Upload model: nll-bg-en-trainAB_NO-MASKING
73ee3ff
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2604166666666667,
"eval_steps": 500,
"global_step": 25,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010416666666666666,
"grad_norm": 0.46898284554481506,
"learning_rate": 0.0001,
"loss": 0.4984,
"objective/entropy": 1536.0,
"step": 1,
"train/nll_loss_a": 0.46235302090644836,
"train/nll_loss_b": 0.5343712766965231,
"val/completion_length": 141.19872029622397,
"val/contain_eos_token": 0.9294871687889099,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.35256410638491315,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.628205140431722,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.09817602237065633,
"val/fraction_cyrillic_b": 0.07149943461020787,
"val/fraction_latin_a": 0.48449812332789105,
"val/fraction_latin_b": 0.5023942093054453,
"val/fraction_number_a": 0.18907449146111807,
"val/fraction_number_b": 0.19445918997128805,
"val/fraction_other_a": 0.22825137277444205,
"val/fraction_other_b": 0.2316471884648005,
"val/fraction_ties": 0.6666666666666666,
"val/lang_prob_bg": 0.0268978967020909,
"val/lang_prob_en": 0.6749410231908163,
"val/latin_first_token": 0.6474358836809794,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.020833333333333332,
"grad_norm": 0.49648183584213257,
"learning_rate": 0.0001,
"loss": 0.4431,
"objective/entropy": 1424.0,
"step": 2,
"train/nll_loss_a": 0.4012756248315175,
"train/nll_loss_b": 0.4849816660086314,
"val/completion_length": 138.66666666666666,
"val/contain_eos_token": 0.9102564056714376,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.32692308227221173,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.0820654605825742,
"val/fraction_cyrillic_b": 0.06807015091180801,
"val/fraction_latin_a": 0.46912774443626404,
"val/fraction_latin_b": 0.4876726269721985,
"val/fraction_number_a": 0.21081160008907318,
"val/fraction_number_b": 0.2032010406255722,
"val/fraction_other_a": 0.2379952073097229,
"val/fraction_other_b": 0.24105618397394815,
"val/fraction_ties": 0.7820512851079305,
"val/lang_prob_bg": 0.03282865695655346,
"val/lang_prob_en": 0.6723186572392782,
"val/latin_first_token": 0.6666666666666666,
"val/number_first_token": 0.0,
"val/other_first_token": 0.006410256649057071
},
{
"epoch": 0.03125,
"grad_norm": 0.6362881660461426,
"learning_rate": 0.0001,
"loss": 0.505,
"objective/entropy": 1469.3333333333333,
"step": 3,
"train/nll_loss_a": 0.41949082414309186,
"train/nll_loss_b": 0.5905094941457113,
"val/completion_length": 153.07691955566406,
"val/contain_eos_token": 0.878205140431722,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.3076923092206319,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.7820512851079305,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.09064680337905884,
"val/fraction_cyrillic_b": 0.07422023018201192,
"val/fraction_latin_a": 0.4563806454340617,
"val/fraction_latin_b": 0.4618365466594696,
"val/fraction_number_a": 0.21234740813573202,
"val/fraction_number_b": 0.21217785278956094,
"val/fraction_other_a": 0.24062515298525491,
"val/fraction_other_b": 0.2517653902371724,
"val/fraction_ties": 0.8205128312110901,
"val/lang_prob_bg": 0.03160186484456062,
"val/lang_prob_en": 0.6696631709734598,
"val/latin_first_token": 0.6923076709111532,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.041666666666666664,
"grad_norm": 0.5365626215934753,
"learning_rate": 0.0001,
"loss": 0.3568,
"objective/entropy": 1538.6666666666667,
"step": 4,
"train/nll_loss_a": 0.3584041992823283,
"train/nll_loss_b": 0.3552741805712382,
"val/completion_length": 139.39102172851562,
"val/contain_eos_token": 0.9230769077936808,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.21794872482617697,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.1602564131220182,
"val/fraction_cyrillic_a": 0.06686830148100853,
"val/fraction_cyrillic_b": 0.040630811204512916,
"val/fraction_latin_a": 0.48262248436609906,
"val/fraction_latin_b": 0.4933239420255025,
"val/fraction_number_a": 0.20719597240289053,
"val/fraction_number_b": 0.22095757722854614,
"val/fraction_other_a": 0.2433132529258728,
"val/fraction_other_b": 0.24508768320083618,
"val/fraction_ties": 0.8589743574460348,
"val/lang_prob_bg": 0.02131816806892554,
"val/lang_prob_en": 0.6912566820780436,
"val/latin_first_token": 0.7820512851079305,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.052083333333333336,
"grad_norm": 0.6317083835601807,
"learning_rate": 0.0001,
"loss": 0.3542,
"objective/entropy": 1616.0,
"step": 5,
"train/nll_loss_a": 0.3730636735757192,
"train/nll_loss_b": 0.33537689844767254,
"val/completion_length": 141.45512898763022,
"val/contain_eos_token": 0.9230769276618958,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.03846153927346071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7692307829856873,
"val/fraction_correct": 0.16666666915019354,
"val/fraction_cyrillic_a": 0.018022120309372742,
"val/fraction_cyrillic_b": 0.008442190863812963,
"val/fraction_latin_a": 0.530072808265686,
"val/fraction_latin_b": 0.5371540983517965,
"val/fraction_number_a": 0.20419377585252127,
"val/fraction_number_b": 0.19861711064974466,
"val/fraction_other_a": 0.2477113058169683,
"val/fraction_other_b": 0.25578661759694415,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.005829914240166545,
"val/lang_prob_en": 0.6994746724764506,
"val/latin_first_token": 0.9615384538968405,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.0625,
"grad_norm": 0.6112334728240967,
"learning_rate": 0.0001,
"loss": 0.3526,
"objective/entropy": 1450.6666666666667,
"step": 6,
"train/nll_loss_a": 0.3616310755411784,
"train/nll_loss_b": 0.3435203830401103,
"val/completion_length": 139.3397420247396,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307978868484,
"val/fraction_both_incorrect": 0.7564102411270142,
"val/fraction_correct": 0.1602564131220182,
"val/fraction_cyrillic_a": 0.005665303532926676,
"val/fraction_cyrillic_b": 0.0012210012258340914,
"val/fraction_latin_a": 0.5337207714716593,
"val/fraction_latin_b": 0.5389339327812195,
"val/fraction_number_a": 0.20246068636576334,
"val/fraction_number_b": 0.20605232814947763,
"val/fraction_other_a": 0.2581532299518585,
"val/fraction_other_b": 0.25379273295402527,
"val/fraction_ties": 0.8333333134651184,
"val/lang_prob_bg": 0.0024220591488604746,
"val/lang_prob_en": 0.716150164604187,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.07291666666666667,
"grad_norm": 0.5834570527076721,
"learning_rate": 0.0001,
"loss": 0.3122,
"objective/entropy": 1482.6666666666667,
"step": 7,
"train/nll_loss_a": 0.31628555059432983,
"train/nll_loss_b": 0.3082062304019928,
"val/completion_length": 140.28205362955728,
"val/contain_eos_token": 0.935897429784139,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.004876403972351302,
"val/fraction_cyrillic_b": 0.0055291604561110335,
"val/fraction_latin_a": 0.51544189453125,
"val/fraction_latin_b": 0.5175811052322388,
"val/fraction_number_a": 0.21839049458503723,
"val/fraction_number_b": 0.21114349365234375,
"val/fraction_other_a": 0.2612912356853485,
"val/fraction_other_b": 0.26574622591336566,
"val/fraction_ties": 0.8461538553237915,
"val/lang_prob_bg": 0.0024805181116486588,
"val/lang_prob_en": 0.7189218997955322,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.5234330296516418,
"learning_rate": 0.0001,
"loss": 0.3689,
"objective/entropy": 1560.0,
"step": 8,
"train/nll_loss_a": 0.38403966029485065,
"train/nll_loss_b": 0.353829691807429,
"val/completion_length": 147.18589782714844,
"val/contain_eos_token": 0.8910256226857504,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.032051283245285354,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.12820513173937798,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.22435897588729858,
"val/fraction_cyrillic_a": 0.005463951422522466,
"val/fraction_cyrillic_b": 0.00364190728093187,
"val/fraction_latin_a": 0.519624650478363,
"val/fraction_latin_b": 0.5310182571411133,
"val/fraction_number_a": 0.22126641869544983,
"val/fraction_number_b": 0.20790701607863107,
"val/fraction_other_a": 0.2536449631055196,
"val/fraction_other_b": 0.25743279854456586,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0022993393164748945,
"val/lang_prob_en": 0.7228630383809408,
"val/latin_first_token": 0.9679486950238546,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.09375,
"grad_norm": 0.46253740787506104,
"learning_rate": 0.0001,
"loss": 0.3548,
"objective/entropy": 1626.6666666666667,
"step": 9,
"train/nll_loss_a": 0.353506733973821,
"train/nll_loss_b": 0.3560173710187276,
"val/completion_length": 141.10897318522134,
"val/contain_eos_token": 0.8974359035491943,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.03846153927346071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.17948718120654425,
"val/fraction_both_incorrect": 0.6666666865348816,
"val/fraction_correct": 0.25641026099522907,
"val/fraction_cyrillic_a": 0.009551782781879107,
"val/fraction_cyrillic_b": 0.007778597995638847,
"val/fraction_latin_a": 0.5005057454109192,
"val/fraction_latin_b": 0.5112853447596232,
"val/fraction_number_a": 0.2224580099185308,
"val/fraction_number_b": 0.2299031764268875,
"val/fraction_other_a": 0.26748446623484295,
"val/fraction_other_b": 0.25103287398815155,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.003838104816774527,
"val/lang_prob_en": 0.7203066547711691,
"val/latin_first_token": 0.9615384538968405,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.6793639063835144,
"learning_rate": 0.0001,
"loss": 0.4072,
"objective/entropy": 1544.0,
"step": 10,
"train/nll_loss_a": 0.40951302647590637,
"train/nll_loss_b": 0.4048899710178375,
"val/completion_length": 139.59615580240884,
"val/contain_eos_token": 0.9230769276618958,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.044871795922517776,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7564102609952291,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.01814807563399275,
"val/fraction_cyrillic_b": 0.01091132735988746,
"val/fraction_latin_a": 0.479096124569575,
"val/fraction_latin_b": 0.48450469970703125,
"val/fraction_number_a": 0.24191749095916748,
"val/fraction_number_b": 0.23329021533330283,
"val/fraction_other_a": 0.2608383099238078,
"val/fraction_other_b": 0.2712937593460083,
"val/fraction_ties": 0.8589743375778198,
"val/lang_prob_bg": 0.005877171643078327,
"val/lang_prob_en": 0.6905626058578491,
"val/latin_first_token": 0.9551281929016113,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.6676069498062134,
"learning_rate": 0.0001,
"loss": 0.3261,
"objective/entropy": 1296.0,
"step": 11,
"train/nll_loss_a": 0.3414422770341237,
"train/nll_loss_b": 0.31081566711266834,
"val/completion_length": 133.35897318522134,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.15384616081913313,
"val/fraction_cyrillic_a": 0.009972451565166315,
"val/fraction_cyrillic_b": 0.008257918680707613,
"val/fraction_latin_a": 0.4708147446314494,
"val/fraction_latin_b": 0.4841614067554474,
"val/fraction_number_a": 0.2511301040649414,
"val/fraction_number_b": 0.23997685313224792,
"val/fraction_other_a": 0.26808270812034607,
"val/fraction_other_b": 0.2676038245360057,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.004965859581716359,
"val/lang_prob_en": 0.7087472081184387,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.125,
"grad_norm": 0.48878854513168335,
"learning_rate": 0.0001,
"loss": 0.3202,
"objective/entropy": 1520.0,
"step": 12,
"train/nll_loss_a": 0.3013697862625122,
"train/nll_loss_b": 0.33911073207855225,
"val/completion_length": 125.4551289876302,
"val/contain_eos_token": 0.9551282127698263,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.01923076994717121,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.6410256425539652,
"val/fraction_correct": 0.21794872482617697,
"val/fraction_cyrillic_a": 0.0016106078983284533,
"val/fraction_cyrillic_b": 0.00164324635018905,
"val/fraction_latin_a": 0.4850431780020396,
"val/fraction_latin_b": 0.49556367595990497,
"val/fraction_number_a": 0.24694832662741342,
"val/fraction_number_b": 0.24452554682890573,
"val/fraction_other_a": 0.26639790336290997,
"val/fraction_other_b": 0.25826754172643024,
"val/fraction_ties": 0.7179487347602844,
"val/lang_prob_bg": 0.0020299581810832024,
"val/lang_prob_en": 0.7036298712094625,
"val/latin_first_token": 0.9807692170143127,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.47080346941947937,
"learning_rate": 0.0001,
"loss": 0.3551,
"objective/entropy": 1610.6666666666667,
"step": 13,
"train/nll_loss_a": 0.3537709911664327,
"train/nll_loss_b": 0.35645443201065063,
"val/completion_length": 132.8397471110026,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.01923076994717121,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.11538461844126384,
"val/fraction_both_incorrect": 0.6410256425539652,
"val/fraction_correct": 0.23717949291070303,
"val/fraction_cyrillic_a": 0.0035886759869754314,
"val/fraction_cyrillic_b": 0.002304682352890571,
"val/fraction_latin_a": 0.5025050441424052,
"val/fraction_latin_b": 0.49992923935254413,
"val/fraction_number_a": 0.22844381630420685,
"val/fraction_number_b": 0.23804503679275513,
"val/fraction_other_a": 0.2654624879360199,
"val/fraction_other_b": 0.2597210705280304,
"val/fraction_ties": 0.7564102609952291,
"val/lang_prob_bg": 0.0021097887850676975,
"val/lang_prob_en": 0.7135748863220215,
"val/latin_first_token": 0.9807692170143127,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.5261266827583313,
"learning_rate": 0.0001,
"loss": 0.3553,
"objective/entropy": 1768.0,
"step": 14,
"train/nll_loss_a": 0.36536062757174176,
"train/nll_loss_b": 0.34532251954078674,
"val/completion_length": 130.3205134073893,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.006410256649057071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.14102564255396524,
"val/fraction_both_incorrect": 0.7435897588729858,
"val/fraction_correct": 0.19871795177459717,
"val/fraction_cyrillic_a": 0.0008547008813669285,
"val/fraction_cyrillic_b": 0.0002670940205765267,
"val/fraction_latin_a": 0.48295870423316956,
"val/fraction_latin_b": 0.4754104216893514,
"val/fraction_number_a": 0.24797451992829642,
"val/fraction_number_b": 0.25482123096783954,
"val/fraction_other_a": 0.26821208000183105,
"val/fraction_other_b": 0.269501268863678,
"val/fraction_ties": 0.8846153616905212,
"val/lang_prob_bg": 0.0016354583591843646,
"val/lang_prob_en": 0.7125194072723389,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.15625,
"grad_norm": 0.5608013868331909,
"learning_rate": 0.0001,
"loss": 0.375,
"objective/entropy": 1813.3333333333333,
"step": 15,
"train/nll_loss_a": 0.3743097384770711,
"train/nll_loss_b": 0.3756645123163859,
"val/completion_length": 113.73076883951823,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.6666666666666666,
"val/fraction_correct": 0.21794872482617697,
"val/fraction_cyrillic_a": 0.0005128205132981142,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4844670593738556,
"val/fraction_latin_b": 0.48018120725949603,
"val/fraction_number_a": 0.24094298481941223,
"val/fraction_number_b": 0.2496140201886495,
"val/fraction_other_a": 0.2740771571795146,
"val/fraction_other_b": 0.2702048122882843,
"val/fraction_ties": 0.7692307829856873,
"val/lang_prob_bg": 0.0012819842668250203,
"val/lang_prob_en": 0.712844451268514,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.5459577441215515,
"learning_rate": 0.0001,
"loss": 0.3467,
"objective/entropy": 1781.3333333333333,
"step": 16,
"train/nll_loss_a": 0.35024779041608173,
"train/nll_loss_b": 0.3431568145751953,
"val/completion_length": 125.86538696289062,
"val/contain_eos_token": 0.9551281929016113,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.1666666716337204,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4834948579470317,
"val/fraction_latin_b": 0.48427216211954754,
"val/fraction_number_a": 0.24997142453988394,
"val/fraction_number_b": 0.24836017191410065,
"val/fraction_other_a": 0.2665337175130844,
"val/fraction_other_b": 0.26736770073572796,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0015082452834273379,
"val/lang_prob_en": 0.6896043419837952,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.006410256649057071
},
{
"epoch": 0.17708333333333334,
"grad_norm": 0.6729714870452881,
"learning_rate": 0.0001,
"loss": 0.3411,
"objective/entropy": 1653.3333333333333,
"step": 17,
"train/nll_loss_a": 0.3364799916744232,
"train/nll_loss_b": 0.34577877322832745,
"val/completion_length": 112.4551264444987,
"val/contain_eos_token": 0.9807692368825277,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.012820513298114141,
"val/fraction_both_incorrect": 0.7564102411270142,
"val/fraction_correct": 0.12820513049761453,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.45367395877838135,
"val/fraction_latin_b": 0.46362873911857605,
"val/fraction_number_a": 0.2715826133886973,
"val/fraction_number_b": 0.2677338620026906,
"val/fraction_other_a": 0.2747434576352437,
"val/fraction_other_b": 0.2686373790105184,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0013079955242574215,
"val/lang_prob_en": 0.7055089473724365,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.1875,
"grad_norm": 0.5278697609901428,
"learning_rate": 0.0001,
"loss": 0.3637,
"objective/entropy": 1666.6666666666667,
"step": 18,
"train/nll_loss_a": 0.3630356788635254,
"train/nll_loss_b": 0.36438990632692975,
"val/completion_length": 106.38461303710938,
"val/contain_eos_token": 0.9743589758872986,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.006410256649057071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.18589743971824646,
"val/fraction_cyrillic_a": 0.00018853696140771112,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.46353839834531146,
"val/fraction_latin_b": 0.4717874725659688,
"val/fraction_number_a": 0.26522815227508545,
"val/fraction_number_b": 0.27208030720551807,
"val/fraction_other_a": 0.27104492982228595,
"val/fraction_other_b": 0.25613221526145935,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0014787697000429034,
"val/lang_prob_en": 0.7207486033439636,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.623100996017456,
"learning_rate": 0.0001,
"loss": 0.339,
"objective/entropy": 1757.3333333333333,
"step": 19,
"train/nll_loss_a": 0.3500674267609914,
"train/nll_loss_b": 0.32788631319999695,
"val/completion_length": 115.87820434570312,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.14102564503749213,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.47628764311472577,
"val/fraction_latin_b": 0.47780614097913104,
"val/fraction_number_a": 0.25701290369033813,
"val/fraction_number_b": 0.2525850087404251,
"val/fraction_other_a": 0.2666994432608287,
"val/fraction_other_b": 0.2696088453133901,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.0013311682268977165,
"val/lang_prob_en": 0.7212471763292948,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.7427172064781189,
"learning_rate": 0.0001,
"loss": 0.3136,
"objective/entropy": 1717.3333333333333,
"step": 20,
"train/nll_loss_a": 0.3320723871390025,
"train/nll_loss_b": 0.2952205240726471,
"val/completion_length": 103.90384674072266,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256649057071,
"val/fraction_both_incorrect": 0.7820512851079305,
"val/fraction_correct": 0.14102564255396524,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.47085316975911456,
"val/fraction_latin_b": 0.46781421701113385,
"val/fraction_number_a": 0.2578504929939906,
"val/fraction_number_b": 0.2744967540105184,
"val/fraction_other_a": 0.27129634221394855,
"val/fraction_other_b": 0.2576890190442403,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0014502551639452577,
"val/lang_prob_en": 0.7153881192207336,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.21875,
"grad_norm": 0.4812980592250824,
"learning_rate": 0.0001,
"loss": 0.2812,
"objective/entropy": 1712.0,
"step": 21,
"train/nll_loss_a": 0.2877577245235443,
"train/nll_loss_b": 0.27465402086575824,
"val/completion_length": 100.94871775309245,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359308679898,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.449023316303889,
"val/fraction_latin_b": 0.43931347131729126,
"val/fraction_number_a": 0.2805411020914714,
"val/fraction_number_b": 0.2886248826980591,
"val/fraction_other_a": 0.2704355716705322,
"val/fraction_other_b": 0.27206166585286456,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0012512764272590478,
"val/lang_prob_en": 0.7036919593811035,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.5802024006843567,
"learning_rate": 0.0001,
"loss": 0.2079,
"objective/entropy": 1754.6666666666667,
"step": 22,
"train/nll_loss_a": 0.2198613981405894,
"train/nll_loss_b": 0.19591793914635977,
"val/completion_length": 105.2051289876302,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.8589743375778198,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4607094426949819,
"val/fraction_latin_b": 0.4435524543126424,
"val/fraction_number_a": 0.2806512117385864,
"val/fraction_number_b": 0.2736863394578298,
"val/fraction_other_a": 0.2586393306652705,
"val/fraction_other_b": 0.28276123603185016,
"val/fraction_ties": 0.9487179517745972,
"val/lang_prob_bg": 0.0013595524554451306,
"val/lang_prob_en": 0.6991243163744608,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.23958333333333334,
"grad_norm": 0.9169826507568359,
"learning_rate": 0.0001,
"loss": 0.3761,
"objective/entropy": 1493.3333333333333,
"step": 23,
"train/nll_loss_a": 0.36857877175013226,
"train/nll_loss_b": 0.38360129793485004,
"val/completion_length": 92.24359130859375,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.025641026596228283,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4382043182849884,
"val/fraction_latin_b": 0.42687369386355084,
"val/fraction_number_a": 0.2829488515853882,
"val/fraction_number_b": 0.296395738919576,
"val/fraction_other_a": 0.2788468599319458,
"val/fraction_other_b": 0.2767305870850881,
"val/fraction_ties": 0.8205128312110901,
"val/lang_prob_bg": 0.0014082260507469375,
"val/lang_prob_en": 0.7083008488019308,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.25,
"grad_norm": 0.6027104258537292,
"learning_rate": 0.0001,
"loss": 0.2801,
"objective/entropy": 1490.6666666666667,
"step": 24,
"train/nll_loss_a": 0.278068482875824,
"train/nll_loss_b": 0.28207358221213025,
"val/completion_length": 86.58333333333333,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.44356054067611694,
"val/fraction_latin_b": 0.44240028659502667,
"val/fraction_number_a": 0.2843793531258901,
"val/fraction_number_b": 0.2815621296564738,
"val/fraction_other_a": 0.272060106197993,
"val/fraction_other_b": 0.2760376036167145,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0013788756914436817,
"val/lang_prob_en": 0.707394023736318,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.2604166666666667,
"grad_norm": 0.6588619351387024,
"learning_rate": 0.0001,
"loss": 0.2373,
"objective/entropy": 1805.3333333333333,
"step": 25,
"train/nll_loss_a": 0.2221569369236628,
"train/nll_loss_b": 0.25250792503356934,
"val/completion_length": 97.98077138264973,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.8461538354555765,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4407140811284383,
"val/fraction_latin_b": 0.449174165725708,
"val/fraction_number_a": 0.2688818077246348,
"val/fraction_number_b": 0.264347364505132,
"val/fraction_other_a": 0.2904041012128194,
"val/fraction_other_b": 0.2864784598350525,
"val/fraction_ties": 0.9230769077936808,
"val/lang_prob_bg": 0.0015077214144791167,
"val/lang_prob_en": 0.7170586188634237,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
}
],
"logging_steps": 1,
"max_steps": 96,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 26,
"trial_name": null,
"trial_params": null
}