cervisiarius
Upload model: nll-bg-en-trainAB_NO-MASKING
73ee3ff
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6770833333333334,
"eval_steps": 500,
"global_step": 65,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010416666666666666,
"grad_norm": 0.46898284554481506,
"learning_rate": 0.0001,
"loss": 0.4984,
"objective/entropy": 1536.0,
"step": 1,
"train/nll_loss_a": 0.46235302090644836,
"train/nll_loss_b": 0.5343712766965231,
"val/completion_length": 141.19872029622397,
"val/contain_eos_token": 0.9294871687889099,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.35256410638491315,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.628205140431722,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.09817602237065633,
"val/fraction_cyrillic_b": 0.07149943461020787,
"val/fraction_latin_a": 0.48449812332789105,
"val/fraction_latin_b": 0.5023942093054453,
"val/fraction_number_a": 0.18907449146111807,
"val/fraction_number_b": 0.19445918997128805,
"val/fraction_other_a": 0.22825137277444205,
"val/fraction_other_b": 0.2316471884648005,
"val/fraction_ties": 0.6666666666666666,
"val/lang_prob_bg": 0.0268978967020909,
"val/lang_prob_en": 0.6749410231908163,
"val/latin_first_token": 0.6474358836809794,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.020833333333333332,
"grad_norm": 0.49648183584213257,
"learning_rate": 0.0001,
"loss": 0.4431,
"objective/entropy": 1424.0,
"step": 2,
"train/nll_loss_a": 0.4012756248315175,
"train/nll_loss_b": 0.4849816660086314,
"val/completion_length": 138.66666666666666,
"val/contain_eos_token": 0.9102564056714376,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.32692308227221173,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.0820654605825742,
"val/fraction_cyrillic_b": 0.06807015091180801,
"val/fraction_latin_a": 0.46912774443626404,
"val/fraction_latin_b": 0.4876726269721985,
"val/fraction_number_a": 0.21081160008907318,
"val/fraction_number_b": 0.2032010406255722,
"val/fraction_other_a": 0.2379952073097229,
"val/fraction_other_b": 0.24105618397394815,
"val/fraction_ties": 0.7820512851079305,
"val/lang_prob_bg": 0.03282865695655346,
"val/lang_prob_en": 0.6723186572392782,
"val/latin_first_token": 0.6666666666666666,
"val/number_first_token": 0.0,
"val/other_first_token": 0.006410256649057071
},
{
"epoch": 0.03125,
"grad_norm": 0.6362881660461426,
"learning_rate": 0.0001,
"loss": 0.505,
"objective/entropy": 1469.3333333333333,
"step": 3,
"train/nll_loss_a": 0.41949082414309186,
"train/nll_loss_b": 0.5905094941457113,
"val/completion_length": 153.07691955566406,
"val/contain_eos_token": 0.878205140431722,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.3076923092206319,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.7820512851079305,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.09064680337905884,
"val/fraction_cyrillic_b": 0.07422023018201192,
"val/fraction_latin_a": 0.4563806454340617,
"val/fraction_latin_b": 0.4618365466594696,
"val/fraction_number_a": 0.21234740813573202,
"val/fraction_number_b": 0.21217785278956094,
"val/fraction_other_a": 0.24062515298525491,
"val/fraction_other_b": 0.2517653902371724,
"val/fraction_ties": 0.8205128312110901,
"val/lang_prob_bg": 0.03160186484456062,
"val/lang_prob_en": 0.6696631709734598,
"val/latin_first_token": 0.6923076709111532,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.041666666666666664,
"grad_norm": 0.5365626215934753,
"learning_rate": 0.0001,
"loss": 0.3568,
"objective/entropy": 1538.6666666666667,
"step": 4,
"train/nll_loss_a": 0.3584041992823283,
"train/nll_loss_b": 0.3552741805712382,
"val/completion_length": 139.39102172851562,
"val/contain_eos_token": 0.9230769077936808,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.21794872482617697,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.1602564131220182,
"val/fraction_cyrillic_a": 0.06686830148100853,
"val/fraction_cyrillic_b": 0.040630811204512916,
"val/fraction_latin_a": 0.48262248436609906,
"val/fraction_latin_b": 0.4933239420255025,
"val/fraction_number_a": 0.20719597240289053,
"val/fraction_number_b": 0.22095757722854614,
"val/fraction_other_a": 0.2433132529258728,
"val/fraction_other_b": 0.24508768320083618,
"val/fraction_ties": 0.8589743574460348,
"val/lang_prob_bg": 0.02131816806892554,
"val/lang_prob_en": 0.6912566820780436,
"val/latin_first_token": 0.7820512851079305,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.052083333333333336,
"grad_norm": 0.6317083835601807,
"learning_rate": 0.0001,
"loss": 0.3542,
"objective/entropy": 1616.0,
"step": 5,
"train/nll_loss_a": 0.3730636735757192,
"train/nll_loss_b": 0.33537689844767254,
"val/completion_length": 141.45512898763022,
"val/contain_eos_token": 0.9230769276618958,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.03846153927346071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7692307829856873,
"val/fraction_correct": 0.16666666915019354,
"val/fraction_cyrillic_a": 0.018022120309372742,
"val/fraction_cyrillic_b": 0.008442190863812963,
"val/fraction_latin_a": 0.530072808265686,
"val/fraction_latin_b": 0.5371540983517965,
"val/fraction_number_a": 0.20419377585252127,
"val/fraction_number_b": 0.19861711064974466,
"val/fraction_other_a": 0.2477113058169683,
"val/fraction_other_b": 0.25578661759694415,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.005829914240166545,
"val/lang_prob_en": 0.6994746724764506,
"val/latin_first_token": 0.9615384538968405,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.0625,
"grad_norm": 0.6112334728240967,
"learning_rate": 0.0001,
"loss": 0.3526,
"objective/entropy": 1450.6666666666667,
"step": 6,
"train/nll_loss_a": 0.3616310755411784,
"train/nll_loss_b": 0.3435203830401103,
"val/completion_length": 139.3397420247396,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307978868484,
"val/fraction_both_incorrect": 0.7564102411270142,
"val/fraction_correct": 0.1602564131220182,
"val/fraction_cyrillic_a": 0.005665303532926676,
"val/fraction_cyrillic_b": 0.0012210012258340914,
"val/fraction_latin_a": 0.5337207714716593,
"val/fraction_latin_b": 0.5389339327812195,
"val/fraction_number_a": 0.20246068636576334,
"val/fraction_number_b": 0.20605232814947763,
"val/fraction_other_a": 0.2581532299518585,
"val/fraction_other_b": 0.25379273295402527,
"val/fraction_ties": 0.8333333134651184,
"val/lang_prob_bg": 0.0024220591488604746,
"val/lang_prob_en": 0.716150164604187,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.07291666666666667,
"grad_norm": 0.5834570527076721,
"learning_rate": 0.0001,
"loss": 0.3122,
"objective/entropy": 1482.6666666666667,
"step": 7,
"train/nll_loss_a": 0.31628555059432983,
"train/nll_loss_b": 0.3082062304019928,
"val/completion_length": 140.28205362955728,
"val/contain_eos_token": 0.935897429784139,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.004876403972351302,
"val/fraction_cyrillic_b": 0.0055291604561110335,
"val/fraction_latin_a": 0.51544189453125,
"val/fraction_latin_b": 0.5175811052322388,
"val/fraction_number_a": 0.21839049458503723,
"val/fraction_number_b": 0.21114349365234375,
"val/fraction_other_a": 0.2612912356853485,
"val/fraction_other_b": 0.26574622591336566,
"val/fraction_ties": 0.8461538553237915,
"val/lang_prob_bg": 0.0024805181116486588,
"val/lang_prob_en": 0.7189218997955322,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.5234330296516418,
"learning_rate": 0.0001,
"loss": 0.3689,
"objective/entropy": 1560.0,
"step": 8,
"train/nll_loss_a": 0.38403966029485065,
"train/nll_loss_b": 0.353829691807429,
"val/completion_length": 147.18589782714844,
"val/contain_eos_token": 0.8910256226857504,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.032051283245285354,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.12820513173937798,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.22435897588729858,
"val/fraction_cyrillic_a": 0.005463951422522466,
"val/fraction_cyrillic_b": 0.00364190728093187,
"val/fraction_latin_a": 0.519624650478363,
"val/fraction_latin_b": 0.5310182571411133,
"val/fraction_number_a": 0.22126641869544983,
"val/fraction_number_b": 0.20790701607863107,
"val/fraction_other_a": 0.2536449631055196,
"val/fraction_other_b": 0.25743279854456586,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0022993393164748945,
"val/lang_prob_en": 0.7228630383809408,
"val/latin_first_token": 0.9679486950238546,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.09375,
"grad_norm": 0.46253740787506104,
"learning_rate": 0.0001,
"loss": 0.3548,
"objective/entropy": 1626.6666666666667,
"step": 9,
"train/nll_loss_a": 0.353506733973821,
"train/nll_loss_b": 0.3560173710187276,
"val/completion_length": 141.10897318522134,
"val/contain_eos_token": 0.8974359035491943,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.03846153927346071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.17948718120654425,
"val/fraction_both_incorrect": 0.6666666865348816,
"val/fraction_correct": 0.25641026099522907,
"val/fraction_cyrillic_a": 0.009551782781879107,
"val/fraction_cyrillic_b": 0.007778597995638847,
"val/fraction_latin_a": 0.5005057454109192,
"val/fraction_latin_b": 0.5112853447596232,
"val/fraction_number_a": 0.2224580099185308,
"val/fraction_number_b": 0.2299031764268875,
"val/fraction_other_a": 0.26748446623484295,
"val/fraction_other_b": 0.25103287398815155,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.003838104816774527,
"val/lang_prob_en": 0.7203066547711691,
"val/latin_first_token": 0.9615384538968405,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.6793639063835144,
"learning_rate": 0.0001,
"loss": 0.4072,
"objective/entropy": 1544.0,
"step": 10,
"train/nll_loss_a": 0.40951302647590637,
"train/nll_loss_b": 0.4048899710178375,
"val/completion_length": 139.59615580240884,
"val/contain_eos_token": 0.9230769276618958,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.044871795922517776,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7564102609952291,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.01814807563399275,
"val/fraction_cyrillic_b": 0.01091132735988746,
"val/fraction_latin_a": 0.479096124569575,
"val/fraction_latin_b": 0.48450469970703125,
"val/fraction_number_a": 0.24191749095916748,
"val/fraction_number_b": 0.23329021533330283,
"val/fraction_other_a": 0.2608383099238078,
"val/fraction_other_b": 0.2712937593460083,
"val/fraction_ties": 0.8589743375778198,
"val/lang_prob_bg": 0.005877171643078327,
"val/lang_prob_en": 0.6905626058578491,
"val/latin_first_token": 0.9551281929016113,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.6676069498062134,
"learning_rate": 0.0001,
"loss": 0.3261,
"objective/entropy": 1296.0,
"step": 11,
"train/nll_loss_a": 0.3414422770341237,
"train/nll_loss_b": 0.31081566711266834,
"val/completion_length": 133.35897318522134,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.012820513298114141,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.15384616081913313,
"val/fraction_cyrillic_a": 0.009972451565166315,
"val/fraction_cyrillic_b": 0.008257918680707613,
"val/fraction_latin_a": 0.4708147446314494,
"val/fraction_latin_b": 0.4841614067554474,
"val/fraction_number_a": 0.2511301040649414,
"val/fraction_number_b": 0.23997685313224792,
"val/fraction_other_a": 0.26808270812034607,
"val/fraction_other_b": 0.2676038245360057,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.004965859581716359,
"val/lang_prob_en": 0.7087472081184387,
"val/latin_first_token": 0.9871794780095419,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.125,
"grad_norm": 0.48878854513168335,
"learning_rate": 0.0001,
"loss": 0.3202,
"objective/entropy": 1520.0,
"step": 12,
"train/nll_loss_a": 0.3013697862625122,
"train/nll_loss_b": 0.33911073207855225,
"val/completion_length": 125.4551289876302,
"val/contain_eos_token": 0.9551282127698263,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.01923076994717121,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.6410256425539652,
"val/fraction_correct": 0.21794872482617697,
"val/fraction_cyrillic_a": 0.0016106078983284533,
"val/fraction_cyrillic_b": 0.00164324635018905,
"val/fraction_latin_a": 0.4850431780020396,
"val/fraction_latin_b": 0.49556367595990497,
"val/fraction_number_a": 0.24694832662741342,
"val/fraction_number_b": 0.24452554682890573,
"val/fraction_other_a": 0.26639790336290997,
"val/fraction_other_b": 0.25826754172643024,
"val/fraction_ties": 0.7179487347602844,
"val/lang_prob_bg": 0.0020299581810832024,
"val/lang_prob_en": 0.7036298712094625,
"val/latin_first_token": 0.9807692170143127,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.47080346941947937,
"learning_rate": 0.0001,
"loss": 0.3551,
"objective/entropy": 1610.6666666666667,
"step": 13,
"train/nll_loss_a": 0.3537709911664327,
"train/nll_loss_b": 0.35645443201065063,
"val/completion_length": 132.8397471110026,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.01923076994717121,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.11538461844126384,
"val/fraction_both_incorrect": 0.6410256425539652,
"val/fraction_correct": 0.23717949291070303,
"val/fraction_cyrillic_a": 0.0035886759869754314,
"val/fraction_cyrillic_b": 0.002304682352890571,
"val/fraction_latin_a": 0.5025050441424052,
"val/fraction_latin_b": 0.49992923935254413,
"val/fraction_number_a": 0.22844381630420685,
"val/fraction_number_b": 0.23804503679275513,
"val/fraction_other_a": 0.2654624879360199,
"val/fraction_other_b": 0.2597210705280304,
"val/fraction_ties": 0.7564102609952291,
"val/lang_prob_bg": 0.0021097887850676975,
"val/lang_prob_en": 0.7135748863220215,
"val/latin_first_token": 0.9807692170143127,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.5261266827583313,
"learning_rate": 0.0001,
"loss": 0.3553,
"objective/entropy": 1768.0,
"step": 14,
"train/nll_loss_a": 0.36536062757174176,
"train/nll_loss_b": 0.34532251954078674,
"val/completion_length": 130.3205134073893,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.006410256649057071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.14102564255396524,
"val/fraction_both_incorrect": 0.7435897588729858,
"val/fraction_correct": 0.19871795177459717,
"val/fraction_cyrillic_a": 0.0008547008813669285,
"val/fraction_cyrillic_b": 0.0002670940205765267,
"val/fraction_latin_a": 0.48295870423316956,
"val/fraction_latin_b": 0.4754104216893514,
"val/fraction_number_a": 0.24797451992829642,
"val/fraction_number_b": 0.25482123096783954,
"val/fraction_other_a": 0.26821208000183105,
"val/fraction_other_b": 0.269501268863678,
"val/fraction_ties": 0.8846153616905212,
"val/lang_prob_bg": 0.0016354583591843646,
"val/lang_prob_en": 0.7125194072723389,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.15625,
"grad_norm": 0.5608013868331909,
"learning_rate": 0.0001,
"loss": 0.375,
"objective/entropy": 1813.3333333333333,
"step": 15,
"train/nll_loss_a": 0.3743097384770711,
"train/nll_loss_b": 0.3756645123163859,
"val/completion_length": 113.73076883951823,
"val/contain_eos_token": 0.942307690779368,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.6666666666666666,
"val/fraction_correct": 0.21794872482617697,
"val/fraction_cyrillic_a": 0.0005128205132981142,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4844670593738556,
"val/fraction_latin_b": 0.48018120725949603,
"val/fraction_number_a": 0.24094298481941223,
"val/fraction_number_b": 0.2496140201886495,
"val/fraction_other_a": 0.2740771571795146,
"val/fraction_other_b": 0.2702048122882843,
"val/fraction_ties": 0.7692307829856873,
"val/lang_prob_bg": 0.0012819842668250203,
"val/lang_prob_en": 0.712844451268514,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.5459577441215515,
"learning_rate": 0.0001,
"loss": 0.3467,
"objective/entropy": 1781.3333333333333,
"step": 16,
"train/nll_loss_a": 0.35024779041608173,
"train/nll_loss_b": 0.3431568145751953,
"val/completion_length": 125.86538696289062,
"val/contain_eos_token": 0.9551281929016113,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.1666666716337204,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4834948579470317,
"val/fraction_latin_b": 0.48427216211954754,
"val/fraction_number_a": 0.24997142453988394,
"val/fraction_number_b": 0.24836017191410065,
"val/fraction_other_a": 0.2665337175130844,
"val/fraction_other_b": 0.26736770073572796,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0015082452834273379,
"val/lang_prob_en": 0.6896043419837952,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.006410256649057071
},
{
"epoch": 0.17708333333333334,
"grad_norm": 0.6729714870452881,
"learning_rate": 0.0001,
"loss": 0.3411,
"objective/entropy": 1653.3333333333333,
"step": 17,
"train/nll_loss_a": 0.3364799916744232,
"train/nll_loss_b": 0.34577877322832745,
"val/completion_length": 112.4551264444987,
"val/contain_eos_token": 0.9807692368825277,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.012820513298114141,
"val/fraction_both_incorrect": 0.7564102411270142,
"val/fraction_correct": 0.12820513049761453,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.45367395877838135,
"val/fraction_latin_b": 0.46362873911857605,
"val/fraction_number_a": 0.2715826133886973,
"val/fraction_number_b": 0.2677338620026906,
"val/fraction_other_a": 0.2747434576352437,
"val/fraction_other_b": 0.2686373790105184,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0013079955242574215,
"val/lang_prob_en": 0.7055089473724365,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.1875,
"grad_norm": 0.5278697609901428,
"learning_rate": 0.0001,
"loss": 0.3637,
"objective/entropy": 1666.6666666666667,
"step": 18,
"train/nll_loss_a": 0.3630356788635254,
"train/nll_loss_b": 0.36438990632692975,
"val/completion_length": 106.38461303710938,
"val/contain_eos_token": 0.9743589758872986,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.006410256649057071,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.18589743971824646,
"val/fraction_cyrillic_a": 0.00018853696140771112,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.46353839834531146,
"val/fraction_latin_b": 0.4717874725659688,
"val/fraction_number_a": 0.26522815227508545,
"val/fraction_number_b": 0.27208030720551807,
"val/fraction_other_a": 0.27104492982228595,
"val/fraction_other_b": 0.25613221526145935,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0014787697000429034,
"val/lang_prob_en": 0.7207486033439636,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.623100996017456,
"learning_rate": 0.0001,
"loss": 0.339,
"objective/entropy": 1757.3333333333333,
"step": 19,
"train/nll_loss_a": 0.3500674267609914,
"train/nll_loss_b": 0.32788631319999695,
"val/completion_length": 115.87820434570312,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.14102564503749213,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.47628764311472577,
"val/fraction_latin_b": 0.47780614097913104,
"val/fraction_number_a": 0.25701290369033813,
"val/fraction_number_b": 0.2525850087404251,
"val/fraction_other_a": 0.2666994432608287,
"val/fraction_other_b": 0.2696088453133901,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.0013311682268977165,
"val/lang_prob_en": 0.7212471763292948,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.7427172064781189,
"learning_rate": 0.0001,
"loss": 0.3136,
"objective/entropy": 1717.3333333333333,
"step": 20,
"train/nll_loss_a": 0.3320723871390025,
"train/nll_loss_b": 0.2952205240726471,
"val/completion_length": 103.90384674072266,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256649057071,
"val/fraction_both_incorrect": 0.7820512851079305,
"val/fraction_correct": 0.14102564255396524,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.47085316975911456,
"val/fraction_latin_b": 0.46781421701113385,
"val/fraction_number_a": 0.2578504929939906,
"val/fraction_number_b": 0.2744967540105184,
"val/fraction_other_a": 0.27129634221394855,
"val/fraction_other_b": 0.2576890190442403,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0014502551639452577,
"val/lang_prob_en": 0.7153881192207336,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.21875,
"grad_norm": 0.4812980592250824,
"learning_rate": 0.0001,
"loss": 0.2812,
"objective/entropy": 1712.0,
"step": 21,
"train/nll_loss_a": 0.2877577245235443,
"train/nll_loss_b": 0.27465402086575824,
"val/completion_length": 100.94871775309245,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359308679898,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.449023316303889,
"val/fraction_latin_b": 0.43931347131729126,
"val/fraction_number_a": 0.2805411020914714,
"val/fraction_number_b": 0.2886248826980591,
"val/fraction_other_a": 0.2704355716705322,
"val/fraction_other_b": 0.27206166585286456,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0012512764272590478,
"val/lang_prob_en": 0.7036919593811035,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.5802024006843567,
"learning_rate": 0.0001,
"loss": 0.2079,
"objective/entropy": 1754.6666666666667,
"step": 22,
"train/nll_loss_a": 0.2198613981405894,
"train/nll_loss_b": 0.19591793914635977,
"val/completion_length": 105.2051289876302,
"val/contain_eos_token": 0.9615384538968405,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.8589743375778198,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4607094426949819,
"val/fraction_latin_b": 0.4435524543126424,
"val/fraction_number_a": 0.2806512117385864,
"val/fraction_number_b": 0.2736863394578298,
"val/fraction_other_a": 0.2586393306652705,
"val/fraction_other_b": 0.28276123603185016,
"val/fraction_ties": 0.9487179517745972,
"val/lang_prob_bg": 0.0013595524554451306,
"val/lang_prob_en": 0.6991243163744608,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.23958333333333334,
"grad_norm": 0.9169826507568359,
"learning_rate": 0.0001,
"loss": 0.3761,
"objective/entropy": 1493.3333333333333,
"step": 23,
"train/nll_loss_a": 0.36857877175013226,
"train/nll_loss_b": 0.38360129793485004,
"val/completion_length": 92.24359130859375,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.025641026596228283,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4382043182849884,
"val/fraction_latin_b": 0.42687369386355084,
"val/fraction_number_a": 0.2829488515853882,
"val/fraction_number_b": 0.296395738919576,
"val/fraction_other_a": 0.2788468599319458,
"val/fraction_other_b": 0.2767305870850881,
"val/fraction_ties": 0.8205128312110901,
"val/lang_prob_bg": 0.0014082260507469375,
"val/lang_prob_en": 0.7083008488019308,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.25,
"grad_norm": 0.6027104258537292,
"learning_rate": 0.0001,
"loss": 0.2801,
"objective/entropy": 1490.6666666666667,
"step": 24,
"train/nll_loss_a": 0.278068482875824,
"train/nll_loss_b": 0.28207358221213025,
"val/completion_length": 86.58333333333333,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.6794871886571249,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.44356054067611694,
"val/fraction_latin_b": 0.44240028659502667,
"val/fraction_number_a": 0.2843793531258901,
"val/fraction_number_b": 0.2815621296564738,
"val/fraction_other_a": 0.272060106197993,
"val/fraction_other_b": 0.2760376036167145,
"val/fraction_ties": 0.7692307631174723,
"val/lang_prob_bg": 0.0013788756914436817,
"val/lang_prob_en": 0.707394023736318,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.2604166666666667,
"grad_norm": 0.6588619351387024,
"learning_rate": 0.0001,
"loss": 0.2373,
"objective/entropy": 1805.3333333333333,
"step": 25,
"train/nll_loss_a": 0.2221569369236628,
"train/nll_loss_b": 0.25250792503356934,
"val/completion_length": 97.98077138264973,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.8461538354555765,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.4407140811284383,
"val/fraction_latin_b": 0.449174165725708,
"val/fraction_number_a": 0.2688818077246348,
"val/fraction_number_b": 0.264347364505132,
"val/fraction_other_a": 0.2904041012128194,
"val/fraction_other_b": 0.2864784598350525,
"val/fraction_ties": 0.9230769077936808,
"val/lang_prob_bg": 0.0015077214144791167,
"val/lang_prob_en": 0.7170586188634237,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.2708333333333333,
"grad_norm": 0.7737333178520203,
"learning_rate": 0.0001,
"loss": 0.2641,
"objective/entropy": 1989.3333333333333,
"step": 26,
"train/nll_loss_a": 0.26104875405629474,
"train/nll_loss_b": 0.26720015704631805,
"val/completion_length": 93.08974202473958,
"val/contain_eos_token": 0.9743589758872986,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.11538461844126384,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.19871795177459717,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.43882275621096295,
"val/fraction_latin_b": 0.4281532069047292,
"val/fraction_number_a": 0.29091211160024005,
"val/fraction_number_b": 0.3004717230796814,
"val/fraction_other_a": 0.2702651371558507,
"val/fraction_other_b": 0.2713750700155894,
"val/fraction_ties": 0.8333333134651184,
"val/lang_prob_bg": 0.0013183245512967308,
"val/lang_prob_en": 0.6850736141204834,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.28125,
"grad_norm": 0.7201813459396362,
"learning_rate": 0.0001,
"loss": 0.2468,
"objective/entropy": 1309.3333333333333,
"step": 27,
"train/nll_loss_a": 0.2481851428747177,
"train/nll_loss_b": 0.24535122017065683,
"val/completion_length": 81.52563985188802,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307730515797,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.15384615709384283,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.42121972640355426,
"val/fraction_latin_b": 0.3958790997664134,
"val/fraction_number_a": 0.30145979921023053,
"val/fraction_number_b": 0.3247312208016713,
"val/fraction_other_a": 0.2773204942544301,
"val/fraction_other_b": 0.2793896694978078,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0012046323778728645,
"val/lang_prob_en": 0.690701444943746,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.5405112504959106,
"learning_rate": 0.0001,
"loss": 0.2421,
"objective/entropy": 1466.6666666666667,
"step": 28,
"train/nll_loss_a": 0.258198360602061,
"train/nll_loss_b": 0.225906973083814,
"val/completion_length": 91.30128224690755,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359060327212,
"val/fraction_both_incorrect": 0.6025640964508057,
"val/fraction_correct": 0.2435897489388784,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.41246400276819867,
"val/fraction_latin_b": 0.4263813893000285,
"val/fraction_number_a": 0.30804495016733807,
"val/fraction_number_b": 0.299861341714859,
"val/fraction_other_a": 0.27949108680089313,
"val/fraction_other_b": 0.2737572491168976,
"val/fraction_ties": 0.692307710647583,
"val/lang_prob_bg": 0.0014168053554991882,
"val/lang_prob_en": 0.7019821604092916,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3020833333333333,
"grad_norm": 0.6843443512916565,
"learning_rate": 0.0001,
"loss": 0.2173,
"objective/entropy": 1717.3333333333333,
"step": 29,
"train/nll_loss_a": 0.2260708212852478,
"train/nll_loss_b": 0.208594411611557,
"val/completion_length": 86.48076883951823,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359184503555,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.18589743971824646,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.41310587525367737,
"val/fraction_latin_b": 0.4215882420539856,
"val/fraction_number_a": 0.3036368489265442,
"val/fraction_number_b": 0.3048081199328105,
"val/fraction_other_a": 0.2832573155562083,
"val/fraction_other_b": 0.2736036380132039,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0014089663745835423,
"val/lang_prob_en": 0.6810818711916605,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3125,
"grad_norm": 0.6732069849967957,
"learning_rate": 0.0001,
"loss": 0.1954,
"objective/entropy": 1698.6666666666667,
"step": 30,
"train/nll_loss_a": 0.190487802028656,
"train/nll_loss_b": 0.20023786028226218,
"val/completion_length": 83.47436014811198,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.692307690779368,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.37804505228996277,
"val/fraction_latin_b": 0.3864828248818715,
"val/fraction_number_a": 0.32556501030921936,
"val/fraction_number_b": 0.3207090497016907,
"val/fraction_other_a": 0.2963899274667104,
"val/fraction_other_b": 0.29280807574590045,
"val/fraction_ties": 0.7307692368825277,
"val/lang_prob_bg": 0.0012525273875022929,
"val/lang_prob_en": 0.6773750185966492,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3229166666666667,
"grad_norm": 1.181767225265503,
"learning_rate": 0.0001,
"loss": 0.2398,
"objective/entropy": 1254.6666666666667,
"step": 31,
"train/nll_loss_a": 0.2306948055823644,
"train/nll_loss_b": 0.2489608426888784,
"val/completion_length": 80.55127970377605,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.8461538354555765,
"val/fraction_correct": 0.09615384911497434,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.3839823206265767,
"val/fraction_latin_b": 0.40403392910957336,
"val/fraction_number_a": 0.3259160916010539,
"val/fraction_number_b": 0.3016844590504964,
"val/fraction_other_a": 0.2901015877723694,
"val/fraction_other_b": 0.29428161183993023,
"val/fraction_ties": 0.8846153815587362,
"val/lang_prob_bg": 0.001477631429831187,
"val/lang_prob_en": 0.693560004234314,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.006410256649057071,
"val/other_first_token": 0.0
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.7345649600028992,
"learning_rate": 0.0001,
"loss": 0.2023,
"objective/entropy": 1706.6666666666667,
"step": 32,
"train/nll_loss_a": 0.18820939461390176,
"train/nll_loss_b": 0.2164141039053599,
"val/completion_length": 86.39102681477864,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.025641026596228283,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.3734320302804311,
"val/fraction_latin_b": 0.39075586199760437,
"val/fraction_number_a": 0.3208834727605184,
"val/fraction_number_b": 0.3199572165807088,
"val/fraction_other_a": 0.30568451682726544,
"val/fraction_other_b": 0.28928691148757935,
"val/fraction_ties": 0.8205128113428751,
"val/lang_prob_bg": 0.001555828144773841,
"val/lang_prob_en": 0.6997750600179037,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.34375,
"grad_norm": 0.6794171929359436,
"learning_rate": 0.0001,
"loss": 0.208,
"objective/entropy": 1765.3333333333333,
"step": 33,
"train/nll_loss_a": 0.197875847419103,
"train/nll_loss_b": 0.2181676377852758,
"val/completion_length": 81.42307790120442,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.12820513049761453,
"val/fraction_both_incorrect": 0.692307710647583,
"val/fraction_correct": 0.21794872482617697,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.36189671357472736,
"val/fraction_latin_b": 0.3368810514609019,
"val/fraction_number_a": 0.34198596080144245,
"val/fraction_number_b": 0.3494710822900136,
"val/fraction_other_a": 0.2961173454920451,
"val/fraction_other_b": 0.3136478662490845,
"val/fraction_ties": 0.8205128113428751,
"val/lang_prob_bg": 0.0014510581968352199,
"val/lang_prob_en": 0.6856165130933126,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3541666666666667,
"grad_norm": 1.2053074836730957,
"learning_rate": 0.0001,
"loss": 0.1771,
"objective/entropy": 1565.3333333333333,
"step": 34,
"train/nll_loss_a": 0.17536027232805887,
"train/nll_loss_b": 0.1789391835530599,
"val/completion_length": 75.41666666666667,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.34743695457776386,
"val/fraction_latin_b": 0.3424769341945648,
"val/fraction_number_a": 0.3374132215976715,
"val/fraction_number_b": 0.3465127448240916,
"val/fraction_other_a": 0.31514982382456463,
"val/fraction_other_b": 0.31101036071777344,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0016215697008495529,
"val/lang_prob_en": 0.6558386087417603,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3645833333333333,
"grad_norm": 0.8247714042663574,
"learning_rate": 0.0001,
"loss": 0.1831,
"objective/entropy": 2050.6666666666665,
"step": 35,
"train/nll_loss_a": 0.1831777443488439,
"train/nll_loss_b": 0.18294000625610352,
"val/completion_length": 79.56410217285156,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307978868484,
"val/fraction_both_incorrect": 0.7179487347602844,
"val/fraction_correct": 0.17948718617359796,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.3364667197068532,
"val/fraction_latin_b": 0.3431033293406169,
"val/fraction_number_a": 0.35041239857673645,
"val/fraction_number_b": 0.3312891523043315,
"val/fraction_other_a": 0.31312089165051776,
"val/fraction_other_b": 0.3256075282891591,
"val/fraction_ties": 0.7948718070983887,
"val/lang_prob_bg": 0.0014793235653390486,
"val/lang_prob_en": 0.6674719850222269,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.375,
"grad_norm": 0.7892264127731323,
"learning_rate": 0.0001,
"loss": 0.2262,
"objective/entropy": 2029.3333333333333,
"step": 36,
"train/nll_loss_a": 0.21969079971313477,
"train/nll_loss_b": 0.23275785644849142,
"val/completion_length": 81.88461558024089,
"val/contain_eos_token": 0.9807692170143127,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359184503555,
"val/fraction_both_incorrect": 0.7564102609952291,
"val/fraction_correct": 0.16666666915019354,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.33385714888572693,
"val/fraction_latin_b": 0.31389813621838886,
"val/fraction_number_a": 0.36293908953666687,
"val/fraction_number_b": 0.368167241414388,
"val/fraction_other_a": 0.3032037814458211,
"val/fraction_other_b": 0.3179346521695455,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0014597336606432993,
"val/lang_prob_en": 0.6864216725031534,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.3854166666666667,
"grad_norm": 0.9461960196495056,
"learning_rate": 0.0001,
"loss": 0.1815,
"objective/entropy": 2218.6666666666665,
"step": 37,
"train/nll_loss_a": 0.1762543668349584,
"train/nll_loss_b": 0.18676617741584778,
"val/completion_length": 78.16666666666667,
"val/contain_eos_token": 0.9743589560190836,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307978868484,
"val/fraction_both_incorrect": 0.7307692368825277,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.31174399455388385,
"val/fraction_latin_b": 0.3052766025066376,
"val/fraction_number_a": 0.3656782905260722,
"val/fraction_number_b": 0.37882108489672345,
"val/fraction_other_a": 0.32257768511772156,
"val/fraction_other_b": 0.3159022927284241,
"val/fraction_ties": 0.807692309220632,
"val/lang_prob_bg": 0.0016823052040611703,
"val/lang_prob_en": 0.682081917921702,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.006410256649057071,
"val/other_first_token": 0.0
},
{
"epoch": 0.3958333333333333,
"grad_norm": 0.7971638441085815,
"learning_rate": 0.0001,
"loss": 0.1558,
"objective/entropy": 1834.6666666666667,
"step": 38,
"train/nll_loss_a": 0.1687836398681005,
"train/nll_loss_b": 0.142802856862545,
"val/completion_length": 75.09615325927734,
"val/contain_eos_token": 0.9807692170143127,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.11538461844126384,
"val/fraction_both_incorrect": 0.7692307829856873,
"val/fraction_correct": 0.1730769251783689,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.3084398905436198,
"val/fraction_latin_b": 0.3190213441848755,
"val/fraction_number_a": 0.38394031922022503,
"val/fraction_number_b": 0.3600513239701589,
"val/fraction_other_a": 0.3076198200384776,
"val/fraction_other_b": 0.32092733184496564,
"val/fraction_ties": 0.8846153815587362,
"val/lang_prob_bg": 0.0013936974961931508,
"val/lang_prob_en": 0.6955586870511373,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.40625,
"grad_norm": 0.5998182892799377,
"learning_rate": 0.0001,
"loss": 0.1452,
"objective/entropy": 928.0,
"step": 39,
"train/nll_loss_a": 0.14895252386728922,
"train/nll_loss_b": 0.14153108249107996,
"val/completion_length": 71.1602554321289,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359308679898,
"val/fraction_both_incorrect": 0.7051282127698263,
"val/fraction_correct": 0.19230769326289496,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.28946439425150555,
"val/fraction_latin_b": 0.287621130545934,
"val/fraction_number_a": 0.3734682301680247,
"val/fraction_number_b": 0.38121453921000165,
"val/fraction_other_a": 0.3370673954486847,
"val/fraction_other_b": 0.33116433024406433,
"val/fraction_ties": 0.7948718070983887,
"val/lang_prob_bg": 0.0014785424573346972,
"val/lang_prob_en": 0.6788028081258138,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.006410256649057071,
"val/other_first_token": 0.0
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.0486812591552734,
"learning_rate": 0.0001,
"loss": 0.1397,
"objective/entropy": 1610.6666666666667,
"step": 40,
"train/nll_loss_a": 0.124597763021787,
"train/nll_loss_b": 0.15475992610057196,
"val/completion_length": 66.44871775309245,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.03846153989434242,
"val/fraction_both_incorrect": 0.8846153815587362,
"val/fraction_correct": 0.07692307730515797,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.26715315381685895,
"val/fraction_latin_b": 0.267175629734993,
"val/fraction_number_a": 0.40364818771680194,
"val/fraction_number_b": 0.40295613805452984,
"val/fraction_other_a": 0.3291986882686615,
"val/fraction_other_b": 0.32986828684806824,
"val/fraction_ties": 0.9230769077936808,
"val/lang_prob_bg": 0.0016380803814778726,
"val/lang_prob_en": 0.6558632055918375,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4270833333333333,
"grad_norm": 1.1442676782608032,
"learning_rate": 0.0001,
"loss": 0.1368,
"objective/entropy": 1390.6666666666667,
"step": 41,
"train/nll_loss_a": 0.13521244128545126,
"train/nll_loss_b": 0.13834577798843384,
"val/completion_length": 66.53845977783203,
"val/contain_eos_token": 0.9871794780095419,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.012820513298114141,
"val/fraction_both_incorrect": 0.8589743375778198,
"val/fraction_correct": 0.07692307916780312,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2713295817375183,
"val/fraction_latin_b": 0.2757815072933833,
"val/fraction_number_a": 0.39802590012550354,
"val/fraction_number_b": 0.3848887085914612,
"val/fraction_other_a": 0.33064452807108563,
"val/fraction_other_b": 0.3393297791481018,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.0014013544811556737,
"val/lang_prob_en": 0.6843119462331136,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4375,
"grad_norm": 1.032313585281372,
"learning_rate": 0.0001,
"loss": 0.1505,
"objective/entropy": 1338.6666666666667,
"step": 42,
"train/nll_loss_a": 0.1546641836563746,
"train/nll_loss_b": 0.146413487692674,
"val/completion_length": 57.61538314819336,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256649057071,
"val/fraction_both_incorrect": 0.8333333333333334,
"val/fraction_correct": 0.11538461595773697,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2686810294787089,
"val/fraction_latin_b": 0.2704887390136719,
"val/fraction_number_a": 0.39721407492955524,
"val/fraction_number_b": 0.38931016127268475,
"val/fraction_other_a": 0.33410489559173584,
"val/fraction_other_b": 0.3402010997136434,
"val/fraction_ties": 0.8974359035491943,
"val/lang_prob_bg": 0.0015042958548292518,
"val/lang_prob_en": 0.6720715363820394,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4479166666666667,
"grad_norm": 1.00838303565979,
"learning_rate": 0.0001,
"loss": 0.1867,
"objective/entropy": 1114.6666666666667,
"step": 43,
"train/nll_loss_a": 0.16402535637219748,
"train/nll_loss_b": 0.20946120719114938,
"val/completion_length": 57.775641123453774,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.051282053192456566,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.12820513173937798,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2534715036551158,
"val/fraction_latin_b": 0.26074858009815216,
"val/fraction_number_a": 0.3934909800688426,
"val/fraction_number_b": 0.40743691722551983,
"val/fraction_other_a": 0.3530375460783641,
"val/fraction_other_b": 0.3318144778410594,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0016378250826771061,
"val/lang_prob_en": 0.6859935522079468,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4583333333333333,
"grad_norm": 0.746932327747345,
"learning_rate": 0.0001,
"loss": 0.1126,
"objective/entropy": 926.0,
"step": 44,
"train/nll_loss_a": 0.12073729187250137,
"train/nll_loss_b": 0.10447523991266887,
"val/completion_length": 57.910256703694664,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410390138626,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.16666666915019354,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.24723881979783377,
"val/fraction_latin_b": 0.2573054035504659,
"val/fraction_number_a": 0.4117300808429718,
"val/fraction_number_b": 0.4151102900505066,
"val/fraction_other_a": 0.3410310943921407,
"val/fraction_other_b": 0.32758431633313495,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.0016183808135489623,
"val/lang_prob_en": 0.6595947543780009,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.46875,
"grad_norm": 0.751488208770752,
"learning_rate": 0.0001,
"loss": 0.1458,
"objective/entropy": 1064.0,
"step": 45,
"train/nll_loss_a": 0.1560243566830953,
"train/nll_loss_b": 0.13560334593057632,
"val/completion_length": 55.36538569132487,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7307692368825277,
"val/fraction_correct": 0.1858974372347196,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2510768473148346,
"val/fraction_latin_b": 0.23666884501775107,
"val/fraction_number_a": 0.39663148919741315,
"val/fraction_number_b": 0.4140782058238983,
"val/fraction_other_a": 0.35229164361953735,
"val/fraction_other_b": 0.34925296902656555,
"val/fraction_ties": 0.8333333333333334,
"val/lang_prob_bg": 0.0012542977929115295,
"val/lang_prob_en": 0.6892314950625101,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4791666666666667,
"grad_norm": 0.7010864019393921,
"learning_rate": 0.0001,
"loss": 0.1338,
"objective/entropy": 997.3333333333334,
"step": 46,
"train/nll_loss_a": 0.14200725158055624,
"train/nll_loss_b": 0.1256332869331042,
"val/completion_length": 51.71153767903646,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.1666666679084301,
"val/fraction_both_incorrect": 0.7435897390047709,
"val/fraction_correct": 0.21153846631447473,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.24885622163613638,
"val/fraction_latin_b": 0.2301209419965744,
"val/fraction_number_a": 0.396872212489446,
"val/fraction_number_b": 0.4051181972026825,
"val/fraction_other_a": 0.3542715708414714,
"val/fraction_other_b": 0.3647608856360118,
"val/fraction_ties": 0.9102564056714376,
"val/lang_prob_bg": 0.0016821225096161168,
"val/lang_prob_en": 0.6777702768643697,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.4895833333333333,
"grad_norm": 1.0925281047821045,
"learning_rate": 0.0001,
"loss": 0.1205,
"objective/entropy": 881.3333333333334,
"step": 47,
"train/nll_loss_a": 0.12804403652747473,
"train/nll_loss_b": 0.11300961673259735,
"val/completion_length": 52.36538569132487,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307978868484,
"val/fraction_both_incorrect": 0.8333333333333334,
"val/fraction_correct": 0.12179487322767575,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.23128040631612143,
"val/fraction_latin_b": 0.2396069417397181,
"val/fraction_number_a": 0.4000318944454193,
"val/fraction_number_b": 0.40589800477027893,
"val/fraction_other_a": 0.3686876992384593,
"val/fraction_other_b": 0.3544950584570567,
"val/fraction_ties": 0.9102564056714376,
"val/lang_prob_bg": 0.0015037450551365812,
"val/lang_prob_en": 0.6730888287226359,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5,
"grad_norm": 1.2894670963287354,
"learning_rate": 0.0001,
"loss": 0.1487,
"objective/entropy": 714.6666666666666,
"step": 48,
"train/nll_loss_a": 0.139557013909022,
"train/nll_loss_b": 0.15790955225626627,
"val/completion_length": 51.801282246907554,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359184503555,
"val/fraction_both_incorrect": 0.8333333134651184,
"val/fraction_correct": 0.12820513049761453,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.20845835904280344,
"val/fraction_latin_b": 0.2055131047964096,
"val/fraction_number_a": 0.428351491689682,
"val/fraction_number_b": 0.4272334774335225,
"val/fraction_other_a": 0.3631901641686757,
"val/fraction_other_b": 0.3672534426053365,
"val/fraction_ties": 0.9230769276618958,
"val/lang_prob_bg": 0.0014591465005651116,
"val/lang_prob_en": 0.6774142583211263,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5104166666666666,
"grad_norm": 1.2826536893844604,
"learning_rate": 0.0001,
"loss": 0.11,
"objective/entropy": 1096.6666666666667,
"step": 49,
"train/nll_loss_a": 0.09283561259508133,
"train/nll_loss_b": 0.1271024172504743,
"val/completion_length": 53.8012809753418,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.8461538354555765,
"val/fraction_correct": 0.10897436365485191,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2129476418097814,
"val/fraction_latin_b": 0.20714954535166422,
"val/fraction_number_a": 0.4297573169072469,
"val/fraction_number_b": 0.4259600241978963,
"val/fraction_other_a": 0.35729504625002545,
"val/fraction_other_b": 0.36689044038454693,
"val/fraction_ties": 0.9102564056714376,
"val/lang_prob_bg": 0.0016641345185538132,
"val/lang_prob_en": 0.6516953508059183,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5208333333333334,
"grad_norm": 0.9105807542800903,
"learning_rate": 0.0001,
"loss": 0.112,
"objective/entropy": 668.6666666666666,
"step": 50,
"train/nll_loss_a": 0.1011932243903478,
"train/nll_loss_b": 0.12279495596885681,
"val/completion_length": 52.903846740722656,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.012820513298114141,
"val/fraction_both_incorrect": 0.7179487347602844,
"val/fraction_correct": 0.14743590354919434,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.2198729912439982,
"val/fraction_latin_b": 0.21549966434637705,
"val/fraction_number_a": 0.4184926648934682,
"val/fraction_number_b": 0.4257381657759349,
"val/fraction_other_a": 0.36163437366485596,
"val/fraction_other_b": 0.3587621847788493,
"val/fraction_ties": 0.7307692368825277,
"val/lang_prob_bg": 0.0013077266824742158,
"val/lang_prob_en": 0.6708633701006571,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.53125,
"grad_norm": 0.6333914399147034,
"learning_rate": 0.0001,
"loss": 0.0843,
"objective/entropy": 578.6666666666666,
"step": 51,
"train/nll_loss_a": 0.08048844834168752,
"train/nll_loss_b": 0.08813040951887767,
"val/completion_length": 50.442307790120445,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.12820513173937798,
"val/fraction_both_incorrect": 0.7179487148920695,
"val/fraction_correct": 0.20512820780277252,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.1867583046356837,
"val/fraction_latin_b": 0.1920985778172811,
"val/fraction_number_a": 0.46239819129308063,
"val/fraction_number_b": 0.45383066932360333,
"val/fraction_other_a": 0.35084352890650433,
"val/fraction_other_b": 0.3540707727273305,
"val/fraction_ties": 0.8461538354555765,
"val/lang_prob_bg": 0.0013779281095291178,
"val/lang_prob_en": 0.6875834663709005,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5416666666666666,
"grad_norm": 0.597698450088501,
"learning_rate": 0.0001,
"loss": 0.0711,
"objective/entropy": 494.0,
"step": 52,
"train/nll_loss_a": 0.0706160341699918,
"train/nll_loss_b": 0.07159051423271497,
"val/completion_length": 52.0961545308431,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.7948717872301737,
"val/fraction_correct": 0.1346153865257899,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19636401534080505,
"val/fraction_latin_b": 0.19669000804424286,
"val/fraction_number_a": 0.4432801107565562,
"val/fraction_number_b": 0.4444128175576528,
"val/fraction_other_a": 0.3603558838367462,
"val/fraction_other_b": 0.3588971694310506,
"val/fraction_ties": 0.8589743574460348,
"val/lang_prob_bg": 0.0014325374116500218,
"val/lang_prob_en": 0.6964165170987447,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5520833333333334,
"grad_norm": 0.9163941144943237,
"learning_rate": 0.0001,
"loss": 0.0759,
"objective/entropy": 582.6666666666666,
"step": 53,
"train/nll_loss_a": 0.061987257252136864,
"train/nll_loss_b": 0.0898251583178838,
"val/completion_length": 51.95512771606445,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.871794859568278,
"val/fraction_correct": 0.10256410390138626,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19930331905682883,
"val/fraction_latin_b": 0.20022966961065927,
"val/fraction_number_a": 0.4515662391980489,
"val/fraction_number_b": 0.4426102936267853,
"val/fraction_other_a": 0.34913045167922974,
"val/fraction_other_b": 0.35716002186139423,
"val/fraction_ties": 0.9487179319063822,
"val/lang_prob_bg": 0.0013443352266525228,
"val/lang_prob_en": 0.6959804097811381,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5625,
"grad_norm": 0.7437398433685303,
"learning_rate": 0.0001,
"loss": 0.0617,
"objective/entropy": 917.3333333333334,
"step": 54,
"train/nll_loss_a": 0.06697492549816768,
"train/nll_loss_b": 0.05643160889546076,
"val/completion_length": 54.410255432128906,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.807692309220632,
"val/fraction_correct": 0.12820513298114142,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.1963134010632833,
"val/fraction_latin_b": 0.19755952060222626,
"val/fraction_number_a": 0.4544989267985026,
"val/fraction_number_b": 0.4627720316251119,
"val/fraction_other_a": 0.3491876721382141,
"val/fraction_other_b": 0.3396684726079305,
"val/fraction_ties": 0.871794859568278,
"val/lang_prob_bg": 0.001135329968140771,
"val/lang_prob_en": 0.6944870551427206,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.5729166666666666,
"grad_norm": 0.8625170588493347,
"learning_rate": 0.0001,
"loss": 0.0791,
"objective/entropy": 1185.3333333333333,
"step": 55,
"train/nll_loss_a": 0.0869336798787117,
"train/nll_loss_b": 0.0713660145799319,
"val/completion_length": 54.77564239501953,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.8333333333333334,
"val/fraction_correct": 0.1153846209247907,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19297984739144644,
"val/fraction_latin_b": 0.2026552508274714,
"val/fraction_number_a": 0.4540139436721802,
"val/fraction_number_b": 0.45077388485272724,
"val/fraction_other_a": 0.3530062139034271,
"val/fraction_other_b": 0.34657086928685504,
"val/fraction_ties": 0.8974358836809794,
"val/lang_prob_bg": 0.00151369022205472,
"val/lang_prob_en": 0.6880850593249003,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.006410256649057071,
"val/other_first_token": 0.0
},
{
"epoch": 0.5833333333333334,
"grad_norm": 1.0587146282196045,
"learning_rate": 0.0001,
"loss": 0.0684,
"objective/entropy": 1028.6666666666667,
"step": 56,
"train/nll_loss_a": 0.06075024977326393,
"train/nll_loss_b": 0.07601286098361015,
"val/completion_length": 53.96153895060221,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.10256410514314969,
"val/fraction_both_incorrect": 0.7820512851079305,
"val/fraction_correct": 0.1602564106384913,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.18348072469234467,
"val/fraction_latin_b": 0.18514560659726462,
"val/fraction_number_a": 0.45656461517016095,
"val/fraction_number_b": 0.4575365384419759,
"val/fraction_other_a": 0.35995468497276306,
"val/fraction_other_b": 0.35731785496075946,
"val/fraction_ties": 0.8846153815587362,
"val/lang_prob_bg": 0.00106729429292803,
"val/lang_prob_en": 0.6962061325709025,
"val/latin_first_token": 0.9935897390047709,
"val/number_first_token": 0.006410256649057071,
"val/other_first_token": 0.0
},
{
"epoch": 0.59375,
"grad_norm": 0.8600048422813416,
"learning_rate": 0.0001,
"loss": 0.1015,
"objective/entropy": 1142.0,
"step": 57,
"train/nll_loss_a": 0.10414389024178188,
"train/nll_loss_b": 0.09876606116692226,
"val/completion_length": 54.6025644938151,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307730515797,
"val/fraction_both_incorrect": 0.807692309220632,
"val/fraction_correct": 0.13461538776755333,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19213247299194336,
"val/fraction_latin_b": 0.18131321668624878,
"val/fraction_number_a": 0.44502533475557965,
"val/fraction_number_b": 0.45092181364695233,
"val/fraction_other_a": 0.36284218231836957,
"val/fraction_other_b": 0.36776500940322876,
"val/fraction_ties": 0.8846153616905212,
"val/lang_prob_bg": 0.001513439230620861,
"val/lang_prob_en": 0.702368974685669,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6041666666666666,
"grad_norm": 0.8079760074615479,
"learning_rate": 0.0001,
"loss": 0.0778,
"objective/entropy": 505.3333333333333,
"step": 58,
"train/nll_loss_a": 0.09139975905418396,
"train/nll_loss_b": 0.06421066199739774,
"val/completion_length": 46.98076883951823,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.14102564752101898,
"val/fraction_both_incorrect": 0.7051282127698263,
"val/fraction_correct": 0.2179487223426501,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.20126528044541678,
"val/fraction_latin_b": 0.19763841231664023,
"val/fraction_number_a": 0.4304538468519847,
"val/fraction_number_b": 0.43739889065424603,
"val/fraction_other_a": 0.3682809074719747,
"val/fraction_other_b": 0.3649626870950063,
"val/fraction_ties": 0.8461538553237915,
"val/lang_prob_bg": 0.0011517573924114306,
"val/lang_prob_en": 0.709509551525116,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6145833333333334,
"grad_norm": 1.1331056356430054,
"learning_rate": 0.0001,
"loss": 0.128,
"objective/entropy": 633.3333333333334,
"step": 59,
"train/nll_loss_a": 0.1187543123960495,
"train/nll_loss_b": 0.1372635985414187,
"val/completion_length": 46.737178802490234,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.08974359184503555,
"val/fraction_both_incorrect": 0.7307692170143127,
"val/fraction_correct": 0.1794871836900711,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.1910944233338038,
"val/fraction_latin_b": 0.18430300056934357,
"val/fraction_number_a": 0.4224574863910675,
"val/fraction_number_b": 0.439374307791392,
"val/fraction_other_a": 0.38644809524218243,
"val/fraction_other_b": 0.3763226866722107,
"val/fraction_ties": 0.8205128113428751,
"val/lang_prob_bg": 0.0015292856842279434,
"val/lang_prob_en": 0.6743942896525065,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.625,
"grad_norm": 0.8633061647415161,
"learning_rate": 0.0001,
"loss": 0.0539,
"objective/entropy": 554.0,
"step": 60,
"train/nll_loss_a": 0.05499819417794546,
"train/nll_loss_b": 0.05279202883442243,
"val/completion_length": 44.160256703694664,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.07692307854692142,
"val/fraction_both_incorrect": 0.8461538354555765,
"val/fraction_correct": 0.11538461844126384,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19793511927127838,
"val/fraction_latin_b": 0.19460705916086832,
"val/fraction_number_a": 0.4268949230511983,
"val/fraction_number_b": 0.4235563079516093,
"val/fraction_other_a": 0.375169962644577,
"val/fraction_other_b": 0.3818366428216298,
"val/fraction_ties": 0.9230769077936808,
"val/lang_prob_bg": 0.0016635601253559191,
"val/lang_prob_en": 0.6812194387118021,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6354166666666666,
"grad_norm": 0.8153233528137207,
"learning_rate": 0.0001,
"loss": 0.0809,
"objective/entropy": 630.6666666666666,
"step": 61,
"train/nll_loss_a": 0.09423964222272237,
"train/nll_loss_b": 0.06749718139568965,
"val/completion_length": 48.903846740722656,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.15384615709384283,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.19230769574642181,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.18816453218460083,
"val/fraction_latin_b": 0.18853654464085898,
"val/fraction_number_a": 0.45722946524620056,
"val/fraction_number_b": 0.43430561820665997,
"val/fraction_other_a": 0.3546060423056285,
"val/fraction_other_b": 0.37715784708658856,
"val/fraction_ties": 0.9230769276618958,
"val/lang_prob_bg": 0.0015237585175782442,
"val/lang_prob_en": 0.6806376179059347,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6458333333333334,
"grad_norm": 1.104749321937561,
"learning_rate": 0.0001,
"loss": 0.0912,
"objective/entropy": 1110.0,
"step": 62,
"train/nll_loss_a": 0.09771117568016052,
"train/nll_loss_b": 0.08477205038070679,
"val/completion_length": 50.39743677775065,
"val/contain_eos_token": 0.9935897390047709,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.05128205195069313,
"val/fraction_both_incorrect": 0.807692289352417,
"val/fraction_correct": 0.1217948744694392,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19005567828814188,
"val/fraction_latin_b": 0.18684939543406168,
"val/fraction_number_a": 0.44661805033683777,
"val/fraction_number_b": 0.45144979159037274,
"val/fraction_other_a": 0.3633263309796651,
"val/fraction_other_b": 0.3617008129755656,
"val/fraction_ties": 0.8589743574460348,
"val/lang_prob_bg": 0.0012832956854254007,
"val/lang_prob_en": 0.690887967745463,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.65625,
"grad_norm": 1.0379236936569214,
"learning_rate": 0.0001,
"loss": 0.0724,
"objective/entropy": 648.0,
"step": 63,
"train/nll_loss_a": 0.06959323212504387,
"train/nll_loss_b": 0.07520159830649693,
"val/completion_length": 46.92948786417643,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256524880727,
"val/fraction_both_incorrect": 0.8846153815587362,
"val/fraction_correct": 0.08974359060327212,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.19109234710534415,
"val/fraction_latin_b": 0.2014519323905309,
"val/fraction_number_a": 0.4389280279477437,
"val/fraction_number_b": 0.4367695450782776,
"val/fraction_other_a": 0.36997965971628827,
"val/fraction_other_b": 0.36177852749824524,
"val/fraction_ties": 0.9487179517745972,
"val/lang_prob_bg": 0.0012157799016373854,
"val/lang_prob_en": 0.6699715455373129,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.8731870651245117,
"learning_rate": 0.0001,
"loss": 0.0729,
"objective/entropy": 430.6666666666667,
"step": 64,
"train/nll_loss_a": 0.08839354167381923,
"train/nll_loss_b": 0.05736600855986277,
"val/completion_length": 46.35897445678711,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.05128205195069313,
"val/fraction_both_incorrect": 0.7435897390047709,
"val/fraction_correct": 0.1538461558520794,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.18153652052084604,
"val/fraction_latin_b": 0.1694901337226232,
"val/fraction_number_a": 0.45514161388079327,
"val/fraction_number_b": 0.47041670481363934,
"val/fraction_other_a": 0.36332186063130695,
"val/fraction_other_b": 0.3600931664307912,
"val/fraction_ties": 0.7948717872301737,
"val/lang_prob_bg": 0.0013296857941895723,
"val/lang_prob_en": 0.6832193930943807,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
},
{
"epoch": 0.6770833333333334,
"grad_norm": 0.8836628198623657,
"learning_rate": 0.0001,
"loss": 0.0866,
"objective/entropy": 508.0,
"step": 65,
"train/nll_loss_a": 0.0954609215259552,
"train/nll_loss_b": 0.07772823919852574,
"val/completion_length": 46.75640996297201,
"val/contain_eos_token": 1.0,
"val/contains_guillemets": 0.0,
"val/cyrillic_first_token": 0.0,
"val/empty_batch": 0.0,
"val/fraction_both_correct": 0.06410256649057071,
"val/fraction_both_incorrect": 0.7692307631174723,
"val/fraction_correct": 0.14743590106566748,
"val/fraction_cyrillic_a": 0.0,
"val/fraction_cyrillic_b": 0.0,
"val/fraction_latin_a": 0.18177295724550882,
"val/fraction_latin_b": 0.18375508983929953,
"val/fraction_number_a": 0.44039592146873474,
"val/fraction_number_b": 0.45297469695409137,
"val/fraction_other_a": 0.377831111351649,
"val/fraction_other_b": 0.36327023307482403,
"val/fraction_ties": 0.8333333333333334,
"val/lang_prob_bg": 0.0013483318810661633,
"val/lang_prob_en": 0.6739258567492167,
"val/latin_first_token": 1.0,
"val/number_first_token": 0.0,
"val/other_first_token": 0.0
}
],
"logging_steps": 1,
"max_steps": 96,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 26,
"trial_name": null,
"trial_params": null
}