{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15625, "eval_steps": 500, "global_step": 15, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010416666666666666, "grad_norm": 0.46898284554481506, "learning_rate": 0.0001, "loss": 0.4984, "objective/entropy": 1536.0, "step": 1, "train/nll_loss_a": 0.46235302090644836, "train/nll_loss_b": 0.5343712766965231, "val/completion_length": 141.19872029622397, "val/contain_eos_token": 0.9294871687889099, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.35256410638491315, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.03846153989434242, "val/fraction_both_incorrect": 0.628205140431722, "val/fraction_correct": 0.20512820780277252, "val/fraction_cyrillic_a": 0.09817602237065633, "val/fraction_cyrillic_b": 0.07149943461020787, "val/fraction_latin_a": 0.48449812332789105, "val/fraction_latin_b": 0.5023942093054453, "val/fraction_number_a": 0.18907449146111807, "val/fraction_number_b": 0.19445918997128805, "val/fraction_other_a": 0.22825137277444205, "val/fraction_other_b": 0.2316471884648005, "val/fraction_ties": 0.6666666666666666, "val/lang_prob_bg": 0.0268978967020909, "val/lang_prob_en": 0.6749410231908163, "val/latin_first_token": 0.6474358836809794, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.020833333333333332, "grad_norm": 0.49648183584213257, "learning_rate": 0.0001, "loss": 0.4431, "objective/entropy": 1424.0, "step": 2, "train/nll_loss_a": 0.4012756248315175, "train/nll_loss_b": 0.4849816660086314, "val/completion_length": 138.66666666666666, "val/contain_eos_token": 0.9102564056714376, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.32692308227221173, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.06410256524880727, "val/fraction_both_incorrect": 0.7179487148920695, "val/fraction_correct": 0.1730769251783689, "val/fraction_cyrillic_a": 0.0820654605825742, "val/fraction_cyrillic_b": 0.06807015091180801, "val/fraction_latin_a": 0.46912774443626404, "val/fraction_latin_b": 0.4876726269721985, "val/fraction_number_a": 0.21081160008907318, "val/fraction_number_b": 0.2032010406255722, "val/fraction_other_a": 0.2379952073097229, "val/fraction_other_b": 0.24105618397394815, "val/fraction_ties": 0.7820512851079305, "val/lang_prob_bg": 0.03282865695655346, "val/lang_prob_en": 0.6723186572392782, "val/latin_first_token": 0.6666666666666666, "val/number_first_token": 0.0, "val/other_first_token": 0.006410256649057071 }, { "epoch": 0.03125, "grad_norm": 0.6362881660461426, "learning_rate": 0.0001, "loss": 0.505, "objective/entropy": 1469.3333333333333, "step": 3, "train/nll_loss_a": 0.41949082414309186, "train/nll_loss_b": 0.5905094941457113, "val/completion_length": 153.07691955566406, "val/contain_eos_token": 0.878205140431722, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.3076923092206319, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.03846153989434242, "val/fraction_both_incorrect": 0.7820512851079305, "val/fraction_correct": 0.12820513298114142, "val/fraction_cyrillic_a": 0.09064680337905884, "val/fraction_cyrillic_b": 0.07422023018201192, "val/fraction_latin_a": 0.4563806454340617, "val/fraction_latin_b": 0.4618365466594696, "val/fraction_number_a": 0.21234740813573202, "val/fraction_number_b": 0.21217785278956094, "val/fraction_other_a": 0.24062515298525491, "val/fraction_other_b": 0.2517653902371724, "val/fraction_ties": 0.8205128312110901, "val/lang_prob_bg": 0.03160186484456062, "val/lang_prob_en": 0.6696631709734598, "val/latin_first_token": 0.6923076709111532, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.041666666666666664, "grad_norm": 0.5365626215934753, "learning_rate": 0.0001, "loss": 0.3568, "objective/entropy": 1538.6666666666667, "step": 4, "train/nll_loss_a": 0.3584041992823283, "train/nll_loss_b": 0.3552741805712382, "val/completion_length": 139.39102172851562, "val/contain_eos_token": 0.9230769077936808, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.21794872482617697, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.08974359060327212, "val/fraction_both_incorrect": 0.7692307631174723, "val/fraction_correct": 0.1602564131220182, "val/fraction_cyrillic_a": 0.06686830148100853, "val/fraction_cyrillic_b": 0.040630811204512916, "val/fraction_latin_a": 0.48262248436609906, "val/fraction_latin_b": 0.4933239420255025, "val/fraction_number_a": 0.20719597240289053, "val/fraction_number_b": 0.22095757722854614, "val/fraction_other_a": 0.2433132529258728, "val/fraction_other_b": 0.24508768320083618, "val/fraction_ties": 0.8589743574460348, "val/lang_prob_bg": 0.02131816806892554, "val/lang_prob_en": 0.6912566820780436, "val/latin_first_token": 0.7820512851079305, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.052083333333333336, "grad_norm": 0.6317083835601807, "learning_rate": 0.0001, "loss": 0.3542, "objective/entropy": 1616.0, "step": 5, "train/nll_loss_a": 0.3730636735757192, "train/nll_loss_b": 0.33537689844767254, "val/completion_length": 141.45512898763022, "val/contain_eos_token": 0.9230769276618958, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.03846153927346071, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.10256410514314969, "val/fraction_both_incorrect": 0.7692307829856873, "val/fraction_correct": 0.16666666915019354, "val/fraction_cyrillic_a": 0.018022120309372742, "val/fraction_cyrillic_b": 0.008442190863812963, "val/fraction_latin_a": 0.530072808265686, "val/fraction_latin_b": 0.5371540983517965, "val/fraction_number_a": 0.20419377585252127, "val/fraction_number_b": 0.19861711064974466, "val/fraction_other_a": 0.2477113058169683, "val/fraction_other_b": 0.25578661759694415, "val/fraction_ties": 0.871794859568278, "val/lang_prob_bg": 0.005829914240166545, "val/lang_prob_en": 0.6994746724764506, "val/latin_first_token": 0.9615384538968405, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.0625, "grad_norm": 0.6112334728240967, "learning_rate": 0.0001, "loss": 0.3526, "objective/entropy": 1450.6666666666667, "step": 6, "train/nll_loss_a": 0.3616310755411784, "train/nll_loss_b": 0.3435203830401103, "val/completion_length": 139.3397420247396, "val/contain_eos_token": 0.942307690779368, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.012820513298114141, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.07692307978868484, "val/fraction_both_incorrect": 0.7564102411270142, "val/fraction_correct": 0.1602564131220182, "val/fraction_cyrillic_a": 0.005665303532926676, "val/fraction_cyrillic_b": 0.0012210012258340914, "val/fraction_latin_a": 0.5337207714716593, "val/fraction_latin_b": 0.5389339327812195, "val/fraction_number_a": 0.20246068636576334, "val/fraction_number_b": 0.20605232814947763, "val/fraction_other_a": 0.2581532299518585, "val/fraction_other_b": 0.25379273295402527, "val/fraction_ties": 0.8333333134651184, "val/lang_prob_bg": 0.0024220591488604746, "val/lang_prob_en": 0.716150164604187, "val/latin_first_token": 0.9871794780095419, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.07291666666666667, "grad_norm": 0.5834570527076721, "learning_rate": 0.0001, "loss": 0.3122, "objective/entropy": 1482.6666666666667, "step": 7, "train/nll_loss_a": 0.31628555059432983, "train/nll_loss_b": 0.3082062304019928, "val/completion_length": 140.28205362955728, "val/contain_eos_token": 0.935897429784139, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.012820513298114141, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.051282053192456566, "val/fraction_both_incorrect": 0.7948717872301737, "val/fraction_correct": 0.12820513298114142, "val/fraction_cyrillic_a": 0.004876403972351302, "val/fraction_cyrillic_b": 0.0055291604561110335, "val/fraction_latin_a": 0.51544189453125, "val/fraction_latin_b": 0.5175811052322388, "val/fraction_number_a": 0.21839049458503723, "val/fraction_number_b": 0.21114349365234375, "val/fraction_other_a": 0.2612912356853485, "val/fraction_other_b": 0.26574622591336566, "val/fraction_ties": 0.8461538553237915, "val/lang_prob_bg": 0.0024805181116486588, "val/lang_prob_en": 0.7189218997955322, "val/latin_first_token": 0.9871794780095419, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.08333333333333333, "grad_norm": 0.5234330296516418, "learning_rate": 0.0001, "loss": 0.3689, "objective/entropy": 1560.0, "step": 8, "train/nll_loss_a": 0.38403966029485065, "train/nll_loss_b": 0.353829691807429, "val/completion_length": 147.18589782714844, "val/contain_eos_token": 0.8910256226857504, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.032051283245285354, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.12820513173937798, "val/fraction_both_incorrect": 0.6794871886571249, "val/fraction_correct": 0.22435897588729858, "val/fraction_cyrillic_a": 0.005463951422522466, "val/fraction_cyrillic_b": 0.00364190728093187, "val/fraction_latin_a": 0.519624650478363, "val/fraction_latin_b": 0.5310182571411133, "val/fraction_number_a": 0.22126641869544983, "val/fraction_number_b": 0.20790701607863107, "val/fraction_other_a": 0.2536449631055196, "val/fraction_other_b": 0.25743279854456586, "val/fraction_ties": 0.807692309220632, "val/lang_prob_bg": 0.0022993393164748945, "val/lang_prob_en": 0.7228630383809408, "val/latin_first_token": 0.9679486950238546, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.09375, "grad_norm": 0.46253740787506104, "learning_rate": 0.0001, "loss": 0.3548, "objective/entropy": 1626.6666666666667, "step": 9, "train/nll_loss_a": 0.353506733973821, "train/nll_loss_b": 0.3560173710187276, "val/completion_length": 141.10897318522134, "val/contain_eos_token": 0.8974359035491943, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.03846153927346071, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.17948718120654425, "val/fraction_both_incorrect": 0.6666666865348816, "val/fraction_correct": 0.25641026099522907, "val/fraction_cyrillic_a": 0.009551782781879107, "val/fraction_cyrillic_b": 0.007778597995638847, "val/fraction_latin_a": 0.5005057454109192, "val/fraction_latin_b": 0.5112853447596232, "val/fraction_number_a": 0.2224580099185308, "val/fraction_number_b": 0.2299031764268875, "val/fraction_other_a": 0.26748446623484295, "val/fraction_other_b": 0.25103287398815155, "val/fraction_ties": 0.8461538354555765, "val/lang_prob_bg": 0.003838104816774527, "val/lang_prob_en": 0.7203066547711691, "val/latin_first_token": 0.9615384538968405, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.10416666666666667, "grad_norm": 0.6793639063835144, "learning_rate": 0.0001, "loss": 0.4072, "objective/entropy": 1544.0, "step": 10, "train/nll_loss_a": 0.40951302647590637, "train/nll_loss_b": 0.4048899710178375, "val/completion_length": 139.59615580240884, "val/contain_eos_token": 0.9230769276618958, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.044871795922517776, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.10256410514314969, "val/fraction_both_incorrect": 0.7564102609952291, "val/fraction_correct": 0.1730769251783689, "val/fraction_cyrillic_a": 0.01814807563399275, "val/fraction_cyrillic_b": 0.01091132735988746, "val/fraction_latin_a": 0.479096124569575, "val/fraction_latin_b": 0.48450469970703125, "val/fraction_number_a": 0.24191749095916748, "val/fraction_number_b": 0.23329021533330283, "val/fraction_other_a": 0.2608383099238078, "val/fraction_other_b": 0.2712937593460083, "val/fraction_ties": 0.8589743375778198, "val/lang_prob_bg": 0.005877171643078327, "val/lang_prob_en": 0.6905626058578491, "val/latin_first_token": 0.9551281929016113, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.11458333333333333, "grad_norm": 0.6676069498062134, "learning_rate": 0.0001, "loss": 0.3261, "objective/entropy": 1296.0, "step": 11, "train/nll_loss_a": 0.3414422770341237, "train/nll_loss_b": 0.31081566711266834, "val/completion_length": 133.35897318522134, "val/contain_eos_token": 0.9615384538968405, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.012820513298114141, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.07692307854692142, "val/fraction_both_incorrect": 0.7692307631174723, "val/fraction_correct": 0.15384616081913313, "val/fraction_cyrillic_a": 0.009972451565166315, "val/fraction_cyrillic_b": 0.008257918680707613, "val/fraction_latin_a": 0.4708147446314494, "val/fraction_latin_b": 0.4841614067554474, "val/fraction_number_a": 0.2511301040649414, "val/fraction_number_b": 0.23997685313224792, "val/fraction_other_a": 0.26808270812034607, "val/fraction_other_b": 0.2676038245360057, "val/fraction_ties": 0.8461538354555765, "val/lang_prob_bg": 0.004965859581716359, "val/lang_prob_en": 0.7087472081184387, "val/latin_first_token": 0.9871794780095419, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.125, "grad_norm": 0.48878854513168335, "learning_rate": 0.0001, "loss": 0.3202, "objective/entropy": 1520.0, "step": 12, "train/nll_loss_a": 0.3013697862625122, "train/nll_loss_b": 0.33911073207855225, "val/completion_length": 125.4551289876302, "val/contain_eos_token": 0.9551282127698263, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.01923076994717121, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.07692307854692142, "val/fraction_both_incorrect": 0.6410256425539652, "val/fraction_correct": 0.21794872482617697, "val/fraction_cyrillic_a": 0.0016106078983284533, "val/fraction_cyrillic_b": 0.00164324635018905, "val/fraction_latin_a": 0.4850431780020396, "val/fraction_latin_b": 0.49556367595990497, "val/fraction_number_a": 0.24694832662741342, "val/fraction_number_b": 0.24452554682890573, "val/fraction_other_a": 0.26639790336290997, "val/fraction_other_b": 0.25826754172643024, "val/fraction_ties": 0.7179487347602844, "val/lang_prob_bg": 0.0020299581810832024, "val/lang_prob_en": 0.7036298712094625, "val/latin_first_token": 0.9807692170143127, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.13541666666666666, "grad_norm": 0.47080346941947937, "learning_rate": 0.0001, "loss": 0.3551, "objective/entropy": 1610.6666666666667, "step": 13, "train/nll_loss_a": 0.3537709911664327, "train/nll_loss_b": 0.35645443201065063, "val/completion_length": 132.8397471110026, "val/contain_eos_token": 0.942307690779368, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.01923076994717121, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.11538461844126384, "val/fraction_both_incorrect": 0.6410256425539652, "val/fraction_correct": 0.23717949291070303, "val/fraction_cyrillic_a": 0.0035886759869754314, "val/fraction_cyrillic_b": 0.002304682352890571, "val/fraction_latin_a": 0.5025050441424052, "val/fraction_latin_b": 0.49992923935254413, "val/fraction_number_a": 0.22844381630420685, "val/fraction_number_b": 0.23804503679275513, "val/fraction_other_a": 0.2654624879360199, "val/fraction_other_b": 0.2597210705280304, "val/fraction_ties": 0.7564102609952291, "val/lang_prob_bg": 0.0021097887850676975, "val/lang_prob_en": 0.7135748863220215, "val/latin_first_token": 0.9807692170143127, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.14583333333333334, "grad_norm": 0.5261266827583313, "learning_rate": 0.0001, "loss": 0.3553, "objective/entropy": 1768.0, "step": 14, "train/nll_loss_a": 0.36536062757174176, "train/nll_loss_b": 0.34532251954078674, "val/completion_length": 130.3205134073893, "val/contain_eos_token": 0.942307690779368, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.006410256649057071, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.14102564255396524, "val/fraction_both_incorrect": 0.7435897588729858, "val/fraction_correct": 0.19871795177459717, "val/fraction_cyrillic_a": 0.0008547008813669285, "val/fraction_cyrillic_b": 0.0002670940205765267, "val/fraction_latin_a": 0.48295870423316956, "val/fraction_latin_b": 0.4754104216893514, "val/fraction_number_a": 0.24797451992829642, "val/fraction_number_b": 0.25482123096783954, "val/fraction_other_a": 0.26821208000183105, "val/fraction_other_b": 0.269501268863678, "val/fraction_ties": 0.8846153616905212, "val/lang_prob_bg": 0.0016354583591843646, "val/lang_prob_en": 0.7125194072723389, "val/latin_first_token": 0.9935897390047709, "val/number_first_token": 0.0, "val/other_first_token": 0.0 }, { "epoch": 0.15625, "grad_norm": 0.5608013868331909, "learning_rate": 0.0001, "loss": 0.375, "objective/entropy": 1813.3333333333333, "step": 15, "train/nll_loss_a": 0.3743097384770711, "train/nll_loss_b": 0.3756645123163859, "val/completion_length": 113.73076883951823, "val/contain_eos_token": 0.942307690779368, "val/contains_guillemets": 0.0, "val/cyrillic_first_token": 0.0, "val/empty_batch": 0.0, "val/fraction_both_correct": 0.10256410514314969, "val/fraction_both_incorrect": 0.6666666666666666, "val/fraction_correct": 0.21794872482617697, "val/fraction_cyrillic_a": 0.0005128205132981142, "val/fraction_cyrillic_b": 0.0, "val/fraction_latin_a": 0.4844670593738556, "val/fraction_latin_b": 0.48018120725949603, "val/fraction_number_a": 0.24094298481941223, "val/fraction_number_b": 0.2496140201886495, "val/fraction_other_a": 0.2740771571795146, "val/fraction_other_b": 0.2702048122882843, "val/fraction_ties": 0.7692307829856873, "val/lang_prob_bg": 0.0012819842668250203, "val/lang_prob_en": 0.712844451268514, "val/latin_first_token": 1.0, "val/number_first_token": 0.0, "val/other_first_token": 0.0 } ], "logging_steps": 1, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 26, "trial_name": null, "trial_params": null }