{ "best_metric": 0.8299749701259963, "best_model_checkpoint": "output/marbert_simce_EuroBERT-EuroBERT-610M_16_bs_1_e/checkpoint-25000", "epoch": 0.8139208532135841, "eval_steps": 500, "global_step": 29000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005613247263541959, "grad_norm": 98309536.0, "learning_rate": 2.8066236317709794e-06, "loss": 7.8472, "step": 200 }, { "epoch": 0.011226494527083918, "grad_norm": 18181398.0, "learning_rate": 5.613247263541959e-06, "loss": 1.8133, "step": 400 }, { "epoch": 0.014033118158854897, "eval_loss": 4.332894802093506, "eval_runtime": 80.7749, "eval_samples_per_second": 81.82, "eval_sequential_score": 0.8033767514947983, "eval_steps_per_second": 1.288, "eval_sts-dev-1152_pearson_cosine": 0.8060511562516744, "eval_sts-dev-1152_spearman_cosine": 0.8033767514947983, "eval_sts-dev-512_pearson_cosine": 0.8032161700580676, "eval_sts-dev-512_spearman_cosine": 0.8009572819138866, "eval_sts-dev-768_pearson_cosine": 0.8037436235908193, "eval_sts-dev-768_spearman_cosine": 0.8010140475354219, "eval_sts-dev-960_pearson_cosine": 0.8052857547543548, "eval_sts-dev-960_spearman_cosine": 0.8024252040181402, "step": 500 }, { "epoch": 0.016839741790625876, "grad_norm": 4002785.75, "learning_rate": 8.41987089531294e-06, "loss": 1.2257, "step": 600 }, { "epoch": 0.022452989054167836, "grad_norm": 14975067.0, "learning_rate": 1.1226494527083917e-05, "loss": 1.0662, "step": 800 }, { "epoch": 0.028066236317709794, "grad_norm": 8259098.0, "learning_rate": 1.4033118158854899e-05, "loss": 1.0452, "step": 1000 }, { "epoch": 0.028066236317709794, "eval_loss": 4.0553975105285645, "eval_runtime": 83.8854, "eval_samples_per_second": 78.786, "eval_sequential_score": 0.8152056130630069, "eval_steps_per_second": 1.24, "eval_sts-dev-1152_pearson_cosine": 0.8162027650076662, "eval_sts-dev-1152_spearman_cosine": 0.8152056130630069, "eval_sts-dev-512_pearson_cosine": 0.8109347704147131, "eval_sts-dev-512_spearman_cosine": 0.811300104502657, "eval_sts-dev-768_pearson_cosine": 0.8133183334539484, "eval_sts-dev-768_spearman_cosine": 0.8130045482521145, "eval_sts-dev-960_pearson_cosine": 0.8156636692837823, "eval_sts-dev-960_spearman_cosine": 0.814950405147375, "step": 1000 }, { "epoch": 0.03367948358125175, "grad_norm": 4874028.5, "learning_rate": 1.683974179062588e-05, "loss": 1.0306, "step": 1200 }, { "epoch": 0.03929273084479371, "grad_norm": 4182119.25, "learning_rate": 1.9646365422396855e-05, "loss": 1.0223, "step": 1400 }, { "epoch": 0.042099354476564696, "eval_loss": 4.015018939971924, "eval_runtime": 84.6437, "eval_samples_per_second": 78.08, "eval_sequential_score": 0.7963919935425935, "eval_steps_per_second": 1.229, "eval_sts-dev-1152_pearson_cosine": 0.7984721704385993, "eval_sts-dev-1152_spearman_cosine": 0.7963919935425935, "eval_sts-dev-512_pearson_cosine": 0.7945396929823907, "eval_sts-dev-512_spearman_cosine": 0.7938897942194647, "eval_sts-dev-768_pearson_cosine": 0.796786289376193, "eval_sts-dev-768_spearman_cosine": 0.7957412002954445, "eval_sts-dev-960_pearson_cosine": 0.7984310473849502, "eval_sts-dev-960_spearman_cosine": 0.7964965649245895, "step": 1500 }, { "epoch": 0.04490597810833567, "grad_norm": 4491974.5, "learning_rate": 2.2452989054167835e-05, "loss": 0.9923, "step": 1600 }, { "epoch": 0.050519225371877634, "grad_norm": 2410247.5, "learning_rate": 2.5259612685938815e-05, "loss": 1.007, "step": 1800 }, { "epoch": 0.05613247263541959, "grad_norm": 3212723.25, "learning_rate": 2.8066236317709798e-05, "loss": 0.9898, "step": 2000 }, { "epoch": 0.05613247263541959, "eval_loss": 4.562768459320068, "eval_runtime": 84.3142, "eval_samples_per_second": 78.385, "eval_sequential_score": 0.7838450146591204, "eval_steps_per_second": 1.233, "eval_sts-dev-1152_pearson_cosine": 0.7765908851768635, "eval_sts-dev-1152_spearman_cosine": 0.7838450146591204, "eval_sts-dev-512_pearson_cosine": 0.7672097671946088, "eval_sts-dev-512_spearman_cosine": 0.7778618805232163, "eval_sts-dev-768_pearson_cosine": 0.7696489245819802, "eval_sts-dev-768_spearman_cosine": 0.7793358424223233, "eval_sts-dev-960_pearson_cosine": 0.7741670009813553, "eval_sts-dev-960_spearman_cosine": 0.7829158933416259, "step": 2000 }, { "epoch": 0.06174571989896155, "grad_norm": 1835005.25, "learning_rate": 3.087285994948078e-05, "loss": 1.033, "step": 2200 }, { "epoch": 0.0673589671625035, "grad_norm": 2749047.0, "learning_rate": 3.367948358125176e-05, "loss": 1.0091, "step": 2400 }, { "epoch": 0.0701655907942745, "eval_loss": 4.613296985626221, "eval_runtime": 81.7193, "eval_samples_per_second": 80.874, "eval_sequential_score": 0.7854737360223817, "eval_steps_per_second": 1.273, "eval_sts-dev-1152_pearson_cosine": 0.7821361682028354, "eval_sts-dev-1152_spearman_cosine": 0.7854737360223817, "eval_sts-dev-512_pearson_cosine": 0.7723481989731885, "eval_sts-dev-512_spearman_cosine": 0.7767002410536074, "eval_sts-dev-768_pearson_cosine": 0.7752960500176977, "eval_sts-dev-768_spearman_cosine": 0.7801463868858681, "eval_sts-dev-960_pearson_cosine": 0.779209273182729, "eval_sts-dev-960_spearman_cosine": 0.7833584025542436, "step": 2500 }, { "epoch": 0.07297221442604547, "grad_norm": 2484525.5, "learning_rate": 3.648610721302274e-05, "loss": 1.046, "step": 2600 }, { "epoch": 0.07858546168958742, "grad_norm": 2774297.5, "learning_rate": 3.929273084479371e-05, "loss": 1.0212, "step": 2800 }, { "epoch": 0.08419870895312939, "grad_norm": 3957846.75, "learning_rate": 4.20993544765647e-05, "loss": 1.0923, "step": 3000 }, { "epoch": 0.08419870895312939, "eval_loss": 5.038168430328369, "eval_runtime": 81.5206, "eval_samples_per_second": 81.071, "eval_sequential_score": 0.769819909731699, "eval_steps_per_second": 1.276, "eval_sts-dev-1152_pearson_cosine": 0.7639981739632917, "eval_sts-dev-1152_spearman_cosine": 0.769819909731699, "eval_sts-dev-512_pearson_cosine": 0.7557090629225184, "eval_sts-dev-512_spearman_cosine": 0.7642757401766183, "eval_sts-dev-768_pearson_cosine": 0.7573996144114894, "eval_sts-dev-768_spearman_cosine": 0.7655547584963449, "eval_sts-dev-960_pearson_cosine": 0.761317538183819, "eval_sts-dev-960_spearman_cosine": 0.7681871634317281, "step": 3000 }, { "epoch": 0.08981195621667135, "grad_norm": 2094610.625, "learning_rate": 4.490597810833567e-05, "loss": 1.0542, "step": 3200 }, { "epoch": 0.0954252034802133, "grad_norm": 1429313.5, "learning_rate": 4.7712601740106656e-05, "loss": 1.025, "step": 3400 }, { "epoch": 0.09823182711198428, "eval_loss": 4.955362319946289, "eval_runtime": 78.4429, "eval_samples_per_second": 84.252, "eval_sequential_score": 0.773224539195293, "eval_steps_per_second": 1.326, "eval_sts-dev-1152_pearson_cosine": 0.7657713900430686, "eval_sts-dev-1152_spearman_cosine": 0.773224539195293, "eval_sts-dev-512_pearson_cosine": 0.7600635376400412, "eval_sts-dev-512_spearman_cosine": 0.7680689406591721, "eval_sts-dev-768_pearson_cosine": 0.7604851132447141, "eval_sts-dev-768_spearman_cosine": 0.7693172932298855, "eval_sts-dev-960_pearson_cosine": 0.7631668495431749, "eval_sts-dev-960_spearman_cosine": 0.7712486101715305, "step": 3500 }, { "epoch": 0.10103845074375527, "grad_norm": 993321.0, "learning_rate": 4.99423082920136e-05, "loss": 1.0056, "step": 3600 }, { "epoch": 0.10665169800729722, "grad_norm": 13974978.0, "learning_rate": 4.963046122181682e-05, "loss": 1.0689, "step": 3800 }, { "epoch": 0.11226494527083918, "grad_norm": 6447055.0, "learning_rate": 4.931861415162005e-05, "loss": 1.0453, "step": 4000 }, { "epoch": 0.11226494527083918, "eval_loss": 5.834151268005371, "eval_runtime": 77.2575, "eval_samples_per_second": 85.545, "eval_sequential_score": 0.7656099368624604, "eval_steps_per_second": 1.346, "eval_sts-dev-1152_pearson_cosine": 0.7552591908965304, "eval_sts-dev-1152_spearman_cosine": 0.7656099368624604, "eval_sts-dev-512_pearson_cosine": 0.7502607892289657, "eval_sts-dev-512_spearman_cosine": 0.7606979870614468, "eval_sts-dev-768_pearson_cosine": 0.7475751433884098, "eval_sts-dev-768_spearman_cosine": 0.7607444598882842, "eval_sts-dev-960_pearson_cosine": 0.7529520081710266, "eval_sts-dev-960_spearman_cosine": 0.764507705472108, "step": 4000 }, { "epoch": 0.11787819253438114, "grad_norm": 1890051.125, "learning_rate": 4.9006767081423274e-05, "loss": 1.0874, "step": 4200 }, { "epoch": 0.1234914397979231, "grad_norm": 1684058.0, "learning_rate": 4.8694920011226495e-05, "loss": 1.0051, "step": 4400 }, { "epoch": 0.12629806342969407, "eval_loss": 5.076698303222656, "eval_runtime": 79.9271, "eval_samples_per_second": 82.688, "eval_sequential_score": 0.7752075113901122, "eval_steps_per_second": 1.301, "eval_sts-dev-1152_pearson_cosine": 0.7719939114671934, "eval_sts-dev-1152_spearman_cosine": 0.7752075113901122, "eval_sts-dev-512_pearson_cosine": 0.7652586024520893, "eval_sts-dev-512_spearman_cosine": 0.7706670879702195, "eval_sts-dev-768_pearson_cosine": 0.7663537315286835, "eval_sts-dev-768_spearman_cosine": 0.771176148682848, "eval_sts-dev-960_pearson_cosine": 0.7691859699812915, "eval_sts-dev-960_spearman_cosine": 0.7728489604857174, "step": 4500 }, { "epoch": 0.12910468706146505, "grad_norm": 1788628.75, "learning_rate": 4.838307294102972e-05, "loss": 1.0007, "step": 4600 }, { "epoch": 0.134717934325007, "grad_norm": 1246448.375, "learning_rate": 4.807122587083295e-05, "loss": 0.9307, "step": 4800 }, { "epoch": 0.140331181588549, "grad_norm": 1078949.875, "learning_rate": 4.775937880063617e-05, "loss": 0.9642, "step": 5000 }, { "epoch": 0.140331181588549, "eval_loss": 5.198572158813477, "eval_runtime": 77.2033, "eval_samples_per_second": 85.605, "eval_sequential_score": 0.7683338860401957, "eval_steps_per_second": 1.347, "eval_sts-dev-1152_pearson_cosine": 0.7579755842587974, "eval_sts-dev-1152_spearman_cosine": 0.7683338860401957, "eval_sts-dev-512_pearson_cosine": 0.7497028852140948, "eval_sts-dev-512_spearman_cosine": 0.761778428268311, "eval_sts-dev-768_pearson_cosine": 0.7527948972243363, "eval_sts-dev-768_spearman_cosine": 0.7652442137002148, "eval_sts-dev-960_pearson_cosine": 0.7555515440882907, "eval_sts-dev-960_spearman_cosine": 0.7666530937388959, "step": 5000 }, { "epoch": 0.14594442885209094, "grad_norm": 1531033.125, "learning_rate": 4.744753173043939e-05, "loss": 0.9259, "step": 5200 }, { "epoch": 0.1515576761156329, "grad_norm": 2660688.5, "learning_rate": 4.713568466024262e-05, "loss": 0.8908, "step": 5400 }, { "epoch": 0.15436429974740387, "eval_loss": 5.210824966430664, "eval_runtime": 78.072, "eval_samples_per_second": 84.653, "eval_sequential_score": 0.7761786001654587, "eval_steps_per_second": 1.332, "eval_sts-dev-1152_pearson_cosine": 0.7628982988651709, "eval_sts-dev-1152_spearman_cosine": 0.7761786001654587, "eval_sts-dev-512_pearson_cosine": 0.7563657029921103, "eval_sts-dev-512_spearman_cosine": 0.7711187793718405, "eval_sts-dev-768_pearson_cosine": 0.7545503313040781, "eval_sts-dev-768_spearman_cosine": 0.7713354514018652, "eval_sts-dev-960_pearson_cosine": 0.7595387577495225, "eval_sts-dev-960_spearman_cosine": 0.774137594546881, "step": 5500 }, { "epoch": 0.15717092337917485, "grad_norm": 1493755.375, "learning_rate": 4.6823837590045846e-05, "loss": 0.8812, "step": 5600 }, { "epoch": 0.1627841706427168, "grad_norm": 1185704.125, "learning_rate": 4.651199051984907e-05, "loss": 0.8544, "step": 5800 }, { "epoch": 0.16839741790625878, "grad_norm": 1171420.375, "learning_rate": 4.6200143449652295e-05, "loss": 0.8314, "step": 6000 }, { "epoch": 0.16839741790625878, "eval_loss": 5.24008321762085, "eval_runtime": 77.1769, "eval_samples_per_second": 85.634, "eval_sequential_score": 0.7731812487805466, "eval_steps_per_second": 1.348, "eval_sts-dev-1152_pearson_cosine": 0.7629915610098019, "eval_sts-dev-1152_spearman_cosine": 0.7731812487805466, "eval_sts-dev-512_pearson_cosine": 0.7548517081547739, "eval_sts-dev-512_spearman_cosine": 0.7660070542727266, "eval_sts-dev-768_pearson_cosine": 0.7552724107119573, "eval_sts-dev-768_spearman_cosine": 0.7688482677049882, "eval_sts-dev-960_pearson_cosine": 0.7593839964325895, "eval_sts-dev-960_spearman_cosine": 0.7709090473847962, "step": 6000 }, { "epoch": 0.17401066516980074, "grad_norm": 20138014.0, "learning_rate": 4.588829637945552e-05, "loss": 0.8258, "step": 6200 }, { "epoch": 0.1796239124333427, "grad_norm": 936058.875, "learning_rate": 4.557644930925874e-05, "loss": 0.8083, "step": 6400 }, { "epoch": 0.18243053606511367, "eval_loss": 5.270838260650635, "eval_runtime": 76.7742, "eval_samples_per_second": 86.084, "eval_sequential_score": 0.7680196935178831, "eval_steps_per_second": 1.355, "eval_sts-dev-1152_pearson_cosine": 0.7542454277081971, "eval_sts-dev-1152_spearman_cosine": 0.7680196935178831, "eval_sts-dev-512_pearson_cosine": 0.7491425293129472, "eval_sts-dev-512_spearman_cosine": 0.7637887014241396, "eval_sts-dev-768_pearson_cosine": 0.7462311896234177, "eval_sts-dev-768_spearman_cosine": 0.7621924407912392, "eval_sts-dev-960_pearson_cosine": 0.7506447400007437, "eval_sts-dev-960_spearman_cosine": 0.7656016598904541, "step": 6500 }, { "epoch": 0.18523715969688465, "grad_norm": 15407098.0, "learning_rate": 4.5264602239061963e-05, "loss": 0.8373, "step": 6600 }, { "epoch": 0.1908504069604266, "grad_norm": 1347636.375, "learning_rate": 4.495275516886519e-05, "loss": 0.8031, "step": 6800 }, { "epoch": 0.19646365422396855, "grad_norm": 1346039.0, "learning_rate": 4.464090809866842e-05, "loss": 0.7375, "step": 7000 }, { "epoch": 0.19646365422396855, "eval_loss": 5.154874801635742, "eval_runtime": 77.428, "eval_samples_per_second": 85.357, "eval_sequential_score": 0.776258910277576, "eval_steps_per_second": 1.343, "eval_sts-dev-1152_pearson_cosine": 0.7608005418896666, "eval_sts-dev-1152_spearman_cosine": 0.776258910277576, "eval_sts-dev-512_pearson_cosine": 0.7589220526161604, "eval_sts-dev-512_spearman_cosine": 0.773786883290433, "eval_sts-dev-768_pearson_cosine": 0.7534825416262227, "eval_sts-dev-768_spearman_cosine": 0.7718540709899384, "eval_sts-dev-960_pearson_cosine": 0.7570206619012192, "eval_sts-dev-960_spearman_cosine": 0.7739587544161404, "step": 7000 }, { "epoch": 0.20207690148751054, "grad_norm": 1730871.5, "learning_rate": 4.432906102847164e-05, "loss": 0.743, "step": 7200 }, { "epoch": 0.2076901487510525, "grad_norm": 610056.0, "learning_rate": 4.4017213958274867e-05, "loss": 0.739, "step": 7400 }, { "epoch": 0.21049677238282347, "eval_loss": 4.818795204162598, "eval_runtime": 78.238, "eval_samples_per_second": 84.473, "eval_sequential_score": 0.7867819228719439, "eval_steps_per_second": 1.329, "eval_sts-dev-1152_pearson_cosine": 0.7777460591194046, "eval_sts-dev-1152_spearman_cosine": 0.7867819228719439, "eval_sts-dev-512_pearson_cosine": 0.7742009652343147, "eval_sts-dev-512_spearman_cosine": 0.7834916540309068, "eval_sts-dev-768_pearson_cosine": 0.771136418007053, "eval_sts-dev-768_spearman_cosine": 0.7825233109168519, "eval_sts-dev-960_pearson_cosine": 0.7749317385070862, "eval_sts-dev-960_spearman_cosine": 0.7849087778447466, "step": 7500 }, { "epoch": 0.21330339601459444, "grad_norm": 816702.6875, "learning_rate": 4.370536688807809e-05, "loss": 0.7399, "step": 7600 }, { "epoch": 0.2189166432781364, "grad_norm": 984849.5625, "learning_rate": 4.3393519817881315e-05, "loss": 0.6723, "step": 7800 }, { "epoch": 0.22452989054167835, "grad_norm": 2291592.5, "learning_rate": 4.3081672747684535e-05, "loss": 0.6866, "step": 8000 }, { "epoch": 0.22452989054167835, "eval_loss": 5.077595233917236, "eval_runtime": 81.793, "eval_samples_per_second": 80.802, "eval_sequential_score": 0.7714443289674112, "eval_steps_per_second": 1.272, "eval_sts-dev-1152_pearson_cosine": 0.7519731587462412, "eval_sts-dev-1152_spearman_cosine": 0.7714443289674112, "eval_sts-dev-512_pearson_cosine": 0.74485948560521, "eval_sts-dev-512_spearman_cosine": 0.7666552681351647, "eval_sts-dev-768_pearson_cosine": 0.7434046002477062, "eval_sts-dev-768_spearman_cosine": 0.7663030810953583, "eval_sts-dev-960_pearson_cosine": 0.7477923465698348, "eval_sts-dev-960_spearman_cosine": 0.7687824273352952, "step": 8000 }, { "epoch": 0.23014313780522033, "grad_norm": 854538.25, "learning_rate": 4.276982567748776e-05, "loss": 0.6556, "step": 8200 }, { "epoch": 0.2357563850687623, "grad_norm": 567974.3125, "learning_rate": 4.245797860729099e-05, "loss": 0.6886, "step": 8400 }, { "epoch": 0.23856300870053326, "eval_loss": 4.77580451965332, "eval_runtime": 81.3533, "eval_samples_per_second": 81.238, "eval_sequential_score": 0.7845369317488485, "eval_steps_per_second": 1.278, "eval_sts-dev-1152_pearson_cosine": 0.7759169223036488, "eval_sts-dev-1152_spearman_cosine": 0.7845369317488485, "eval_sts-dev-512_pearson_cosine": 0.7701848660015407, "eval_sts-dev-512_spearman_cosine": 0.7810897586341158, "eval_sts-dev-768_pearson_cosine": 0.7706226329945012, "eval_sts-dev-768_spearman_cosine": 0.7807681848391878, "eval_sts-dev-960_pearson_cosine": 0.7736197337766615, "eval_sts-dev-960_spearman_cosine": 0.7827940121873471, "step": 8500 }, { "epoch": 0.24136963233230424, "grad_norm": 984615.125, "learning_rate": 4.214613153709421e-05, "loss": 0.685, "step": 8600 }, { "epoch": 0.2469828795958462, "grad_norm": 891299.0625, "learning_rate": 4.183428446689743e-05, "loss": 0.6401, "step": 8800 }, { "epoch": 0.25259612685938815, "grad_norm": 815193.5, "learning_rate": 4.152243739670066e-05, "loss": 0.6617, "step": 9000 }, { "epoch": 0.25259612685938815, "eval_loss": 4.602816581726074, "eval_runtime": 78.6877, "eval_samples_per_second": 83.99, "eval_sequential_score": 0.7821317387888623, "eval_steps_per_second": 1.322, "eval_sts-dev-1152_pearson_cosine": 0.7751221600500908, "eval_sts-dev-1152_spearman_cosine": 0.7821317387888623, "eval_sts-dev-512_pearson_cosine": 0.7694106022891276, "eval_sts-dev-512_spearman_cosine": 0.7776661813002868, "eval_sts-dev-768_pearson_cosine": 0.7688507047470482, "eval_sts-dev-768_spearman_cosine": 0.7773842606816174, "eval_sts-dev-960_pearson_cosine": 0.772897246519623, "eval_sts-dev-960_spearman_cosine": 0.7804687323409016, "step": 9000 }, { "epoch": 0.2582093741229301, "grad_norm": 677073.1875, "learning_rate": 4.121059032650389e-05, "loss": 0.6208, "step": 9200 }, { "epoch": 0.26382262138647206, "grad_norm": 998130.3125, "learning_rate": 4.089874325630711e-05, "loss": 0.6307, "step": 9400 }, { "epoch": 0.26662924501824303, "eval_loss": 4.539032459259033, "eval_runtime": 78.0206, "eval_samples_per_second": 84.708, "eval_sequential_score": 0.7853296979011097, "eval_steps_per_second": 1.333, "eval_sts-dev-1152_pearson_cosine": 0.7765914965051797, "eval_sts-dev-1152_spearman_cosine": 0.7853296979011097, "eval_sts-dev-512_pearson_cosine": 0.7723537190754219, "eval_sts-dev-512_spearman_cosine": 0.7823374065712144, "eval_sts-dev-768_pearson_cosine": 0.7715991971997825, "eval_sts-dev-768_spearman_cosine": 0.7821482216403839, "eval_sts-dev-960_pearson_cosine": 0.7750935828664101, "eval_sts-dev-960_spearman_cosine": 0.7843683074967508, "step": 9500 }, { "epoch": 0.269435868650014, "grad_norm": 4926974.0, "learning_rate": 4.0586896186110335e-05, "loss": 0.6557, "step": 9600 }, { "epoch": 0.275049115913556, "grad_norm": 1349007.875, "learning_rate": 4.027504911591356e-05, "loss": 0.6102, "step": 9800 }, { "epoch": 0.280662363177098, "grad_norm": 4552381.5, "learning_rate": 3.996320204571678e-05, "loss": 0.5917, "step": 10000 }, { "epoch": 0.280662363177098, "eval_loss": 4.608828067779541, "eval_runtime": 77.5783, "eval_samples_per_second": 85.191, "eval_sequential_score": 0.7826250058709692, "eval_steps_per_second": 1.341, "eval_sts-dev-1152_pearson_cosine": 0.7759367710047786, "eval_sts-dev-1152_spearman_cosine": 0.7826250058709692, "eval_sts-dev-512_pearson_cosine": 0.7683710236695287, "eval_sts-dev-512_spearman_cosine": 0.7770467944017624, "eval_sts-dev-768_pearson_cosine": 0.7700346176363122, "eval_sts-dev-768_spearman_cosine": 0.7779312583550618, "eval_sts-dev-960_pearson_cosine": 0.7737155642846232, "eval_sts-dev-960_spearman_cosine": 0.7807350237124752, "step": 10000 }, { "epoch": 0.2862756104406399, "grad_norm": 836897.8125, "learning_rate": 3.9651354975520004e-05, "loss": 0.5845, "step": 10200 }, { "epoch": 0.2918888577041819, "grad_norm": 735510.0625, "learning_rate": 3.933950790532323e-05, "loss": 0.6018, "step": 10400 }, { "epoch": 0.29469548133595286, "eval_loss": 4.563432216644287, "eval_runtime": 77.4091, "eval_samples_per_second": 85.378, "eval_sequential_score": 0.7901603432215759, "eval_steps_per_second": 1.344, "eval_sts-dev-1152_pearson_cosine": 0.7810780782124218, "eval_sts-dev-1152_spearman_cosine": 0.7901603432215759, "eval_sts-dev-512_pearson_cosine": 0.7767016322826037, "eval_sts-dev-512_spearman_cosine": 0.787071277210133, "eval_sts-dev-768_pearson_cosine": 0.7756406085198688, "eval_sts-dev-768_spearman_cosine": 0.786612173354875, "eval_sts-dev-960_pearson_cosine": 0.7789839662649704, "eval_sts-dev-960_spearman_cosine": 0.7885719593916782, "step": 10500 }, { "epoch": 0.29750210496772383, "grad_norm": 1244390.5, "learning_rate": 3.902766083512646e-05, "loss": 0.5859, "step": 10600 }, { "epoch": 0.3031153522312658, "grad_norm": 1826701.25, "learning_rate": 3.871581376492968e-05, "loss": 0.5933, "step": 10800 }, { "epoch": 0.30872859949480774, "grad_norm": 1036837.75, "learning_rate": 3.840396669473291e-05, "loss": 0.5717, "step": 11000 }, { "epoch": 0.30872859949480774, "eval_loss": 4.405139446258545, "eval_runtime": 80.2702, "eval_samples_per_second": 82.334, "eval_sequential_score": 0.7930013510501455, "eval_steps_per_second": 1.296, "eval_sts-dev-1152_pearson_cosine": 0.7842500678425964, "eval_sts-dev-1152_spearman_cosine": 0.7930013510501455, "eval_sts-dev-512_pearson_cosine": 0.7804560477931245, "eval_sts-dev-512_spearman_cosine": 0.790194108314055, "eval_sts-dev-768_pearson_cosine": 0.7795286908922767, "eval_sts-dev-768_spearman_cosine": 0.7903488938053814, "eval_sts-dev-960_pearson_cosine": 0.7823368063838988, "eval_sts-dev-960_spearman_cosine": 0.7916644882542199, "step": 11000 }, { "epoch": 0.3143418467583497, "grad_norm": 505930.4375, "learning_rate": 3.809211962453613e-05, "loss": 0.5719, "step": 11200 }, { "epoch": 0.31995509402189165, "grad_norm": 573586.3125, "learning_rate": 3.7780272554339355e-05, "loss": 0.5422, "step": 11400 }, { "epoch": 0.3227617176536626, "eval_loss": 4.44298791885376, "eval_runtime": 79.3492, "eval_samples_per_second": 83.29, "eval_sequential_score": 0.7942242704582879, "eval_steps_per_second": 1.311, "eval_sts-dev-1152_pearson_cosine": 0.7875744329285601, "eval_sts-dev-1152_spearman_cosine": 0.7942242704582879, "eval_sts-dev-512_pearson_cosine": 0.7840443572822089, "eval_sts-dev-512_spearman_cosine": 0.7920209001098614, "eval_sts-dev-768_pearson_cosine": 0.7834004871669387, "eval_sts-dev-768_spearman_cosine": 0.7917739294182365, "eval_sts-dev-960_pearson_cosine": 0.7857857853856216, "eval_sts-dev-960_spearman_cosine": 0.7931912176322746, "step": 11500 }, { "epoch": 0.3255683412854336, "grad_norm": 2376617.5, "learning_rate": 3.7468425484142576e-05, "loss": 0.527, "step": 11600 }, { "epoch": 0.33118158854897556, "grad_norm": 1004661.5, "learning_rate": 3.7156578413945803e-05, "loss": 0.5291, "step": 11800 }, { "epoch": 0.33679483581251757, "grad_norm": 681331.25, "learning_rate": 3.684473134374903e-05, "loss": 0.542, "step": 12000 }, { "epoch": 0.33679483581251757, "eval_loss": 4.3000807762146, "eval_runtime": 78.0165, "eval_samples_per_second": 84.713, "eval_sequential_score": 0.7933961511062919, "eval_steps_per_second": 1.333, "eval_sts-dev-1152_pearson_cosine": 0.7901723077330912, "eval_sts-dev-1152_spearman_cosine": 0.7933961511062919, "eval_sts-dev-512_pearson_cosine": 0.7863821340291279, "eval_sts-dev-512_spearman_cosine": 0.7907396171652296, "eval_sts-dev-768_pearson_cosine": 0.7861497979708226, "eval_sts-dev-768_spearman_cosine": 0.7905983485083213, "eval_sts-dev-960_pearson_cosine": 0.7892627814382922, "eval_sts-dev-960_spearman_cosine": 0.7927711835546136, "step": 12000 }, { "epoch": 0.3424080830760595, "grad_norm": 339770.1875, "learning_rate": 3.653288427355225e-05, "loss": 0.5213, "step": 12200 }, { "epoch": 0.3480213303396015, "grad_norm": 16329254.0, "learning_rate": 3.622103720335547e-05, "loss": 0.5226, "step": 12400 }, { "epoch": 0.35082795397137245, "eval_loss": 4.8531880378723145, "eval_runtime": 78.4497, "eval_samples_per_second": 84.245, "eval_sequential_score": 0.7723953575556768, "eval_steps_per_second": 1.326, "eval_sts-dev-1152_pearson_cosine": 0.7559096984425644, "eval_sts-dev-1152_spearman_cosine": 0.7723953575556768, "eval_sts-dev-512_pearson_cosine": 0.7513107904922858, "eval_sts-dev-512_spearman_cosine": 0.7687285653830139, "eval_sts-dev-768_pearson_cosine": 0.7485837494255871, "eval_sts-dev-768_spearman_cosine": 0.7673974329389454, "eval_sts-dev-960_pearson_cosine": 0.7527733497014006, "eval_sts-dev-960_spearman_cosine": 0.7700910811459013, "step": 12500 }, { "epoch": 0.35363457760314343, "grad_norm": 751577.375, "learning_rate": 3.59091901331587e-05, "loss": 0.5111, "step": 12600 }, { "epoch": 0.3592478248666854, "grad_norm": 1020267.75, "learning_rate": 3.559734306296193e-05, "loss": 0.51, "step": 12800 }, { "epoch": 0.36486107213022734, "grad_norm": 2245223.25, "learning_rate": 3.528549599276515e-05, "loss": 0.5439, "step": 13000 }, { "epoch": 0.36486107213022734, "eval_loss": 4.562457084655762, "eval_runtime": 76.5025, "eval_samples_per_second": 86.389, "eval_sequential_score": 0.7871283857415448, "eval_steps_per_second": 1.359, "eval_sts-dev-1152_pearson_cosine": 0.7813516950726106, "eval_sts-dev-1152_spearman_cosine": 0.7871283857415448, "eval_sts-dev-512_pearson_cosine": 0.7790901108872998, "eval_sts-dev-512_spearman_cosine": 0.785904654893658, "eval_sts-dev-768_pearson_cosine": 0.7768149257929241, "eval_sts-dev-768_spearman_cosine": 0.7839641875290246, "eval_sts-dev-960_pearson_cosine": 0.7799312425424749, "eval_sts-dev-960_spearman_cosine": 0.7859198424326749, "step": 13000 }, { "epoch": 0.3704743193937693, "grad_norm": 295381.71875, "learning_rate": 3.4973648922568375e-05, "loss": 0.4944, "step": 13200 }, { "epoch": 0.37608756665731125, "grad_norm": 258496.15625, "learning_rate": 3.46618018523716e-05, "loss": 0.5055, "step": 13400 }, { "epoch": 0.3788941902890822, "eval_loss": 4.690097808837891, "eval_runtime": 77.0112, "eval_samples_per_second": 85.819, "eval_sequential_score": 0.7716690451444436, "eval_steps_per_second": 1.35, "eval_sts-dev-1152_pearson_cosine": 0.7550740608400445, "eval_sts-dev-1152_spearman_cosine": 0.7716690451444436, "eval_sts-dev-512_pearson_cosine": 0.7512508042826231, "eval_sts-dev-512_spearman_cosine": 0.7686861357730667, "eval_sts-dev-768_pearson_cosine": 0.7485196351380123, "eval_sts-dev-768_spearman_cosine": 0.7674469031229442, "eval_sts-dev-960_pearson_cosine": 0.7531056155361794, "eval_sts-dev-960_spearman_cosine": 0.770399492414998, "step": 13500 }, { "epoch": 0.3817008139208532, "grad_norm": 1523134.625, "learning_rate": 3.4349954782174824e-05, "loss": 0.4914, "step": 13600 }, { "epoch": 0.38731406118439515, "grad_norm": 824847.5625, "learning_rate": 3.4038107711978044e-05, "loss": 0.4832, "step": 13800 }, { "epoch": 0.3929273084479371, "grad_norm": 942315.875, "learning_rate": 3.372626064178127e-05, "loss": 0.4974, "step": 14000 }, { "epoch": 0.3929273084479371, "eval_loss": 4.3223676681518555, "eval_runtime": 76.7925, "eval_samples_per_second": 86.063, "eval_sequential_score": 0.7833165592743722, "eval_steps_per_second": 1.354, "eval_sts-dev-1152_pearson_cosine": 0.7744088760069867, "eval_sts-dev-1152_spearman_cosine": 0.7833165592743722, "eval_sts-dev-512_pearson_cosine": 0.7698343591511576, "eval_sts-dev-512_spearman_cosine": 0.7802103790467256, "eval_sts-dev-768_pearson_cosine": 0.7695304076449843, "eval_sts-dev-768_spearman_cosine": 0.780087860180158, "eval_sts-dev-960_pearson_cosine": 0.7727501854173389, "eval_sts-dev-960_spearman_cosine": 0.7819591085796292, "step": 14000 }, { "epoch": 0.3985405557114791, "grad_norm": 755688.0625, "learning_rate": 3.34144135715845e-05, "loss": 0.4834, "step": 14200 }, { "epoch": 0.40415380297502107, "grad_norm": 1176346.25, "learning_rate": 3.310256650138772e-05, "loss": 0.4526, "step": 14400 }, { "epoch": 0.40696042660679205, "eval_loss": 4.603125095367432, "eval_runtime": 77.0079, "eval_samples_per_second": 85.822, "eval_sequential_score": 0.7798168210294973, "eval_steps_per_second": 1.351, "eval_sts-dev-1152_pearson_cosine": 0.7695286357100837, "eval_sts-dev-1152_spearman_cosine": 0.7798168210294973, "eval_sts-dev-512_pearson_cosine": 0.7680031892756588, "eval_sts-dev-512_spearman_cosine": 0.7793198654141038, "eval_sts-dev-768_pearson_cosine": 0.765316531020936, "eval_sts-dev-768_spearman_cosine": 0.7771469239892022, "eval_sts-dev-960_pearson_cosine": 0.7678963030870366, "eval_sts-dev-960_spearman_cosine": 0.7788543197202844, "step": 14500 }, { "epoch": 0.409767050238563, "grad_norm": 555539.0625, "learning_rate": 3.279071943119095e-05, "loss": 0.4621, "step": 14600 }, { "epoch": 0.415380297502105, "grad_norm": 293199.1875, "learning_rate": 3.2478872360994175e-05, "loss": 0.4483, "step": 14800 }, { "epoch": 0.42099354476564693, "grad_norm": 1106311.5, "learning_rate": 3.2167025290797396e-05, "loss": 0.4422, "step": 15000 }, { "epoch": 0.42099354476564693, "eval_loss": 4.340238094329834, "eval_runtime": 78.5719, "eval_samples_per_second": 84.114, "eval_sequential_score": 0.7852896447445069, "eval_steps_per_second": 1.324, "eval_sts-dev-1152_pearson_cosine": 0.7790739260952181, "eval_sts-dev-1152_spearman_cosine": 0.7852896447445069, "eval_sts-dev-512_pearson_cosine": 0.7764628401440794, "eval_sts-dev-512_spearman_cosine": 0.7835744067282309, "eval_sts-dev-768_pearson_cosine": 0.7743919883086495, "eval_sts-dev-768_spearman_cosine": 0.7820714131385429, "eval_sts-dev-960_pearson_cosine": 0.7773513056662522, "eval_sts-dev-960_spearman_cosine": 0.7840212510775064, "step": 15000 }, { "epoch": 0.4266067920291889, "grad_norm": 610679.5625, "learning_rate": 3.1855178220600616e-05, "loss": 0.4144, "step": 15200 }, { "epoch": 0.43222003929273084, "grad_norm": 687958.4375, "learning_rate": 3.1543331150403844e-05, "loss": 0.4099, "step": 15400 }, { "epoch": 0.4350266629245018, "eval_loss": 4.453821182250977, "eval_runtime": 77.8538, "eval_samples_per_second": 84.89, "eval_sequential_score": 0.7885469182471105, "eval_steps_per_second": 1.336, "eval_sts-dev-1152_pearson_cosine": 0.7803450152456344, "eval_sts-dev-1152_spearman_cosine": 0.7885469182471105, "eval_sts-dev-512_pearson_cosine": 0.7769126522889329, "eval_sts-dev-512_spearman_cosine": 0.7860781907940958, "eval_sts-dev-768_pearson_cosine": 0.7768788991918325, "eval_sts-dev-768_spearman_cosine": 0.7860822710268899, "eval_sts-dev-960_pearson_cosine": 0.7790539464640599, "eval_sts-dev-960_spearman_cosine": 0.7873534072847509, "step": 15500 }, { "epoch": 0.4378332865562728, "grad_norm": 465139.34375, "learning_rate": 3.123148408020707e-05, "loss": 0.4196, "step": 15600 }, { "epoch": 0.44344653381981475, "grad_norm": 1473414.0, "learning_rate": 3.091963701001029e-05, "loss": 0.4273, "step": 15800 }, { "epoch": 0.4490597810833567, "grad_norm": 1426157.25, "learning_rate": 3.060778993981352e-05, "loss": 1.9924, "step": 16000 }, { "epoch": 0.4490597810833567, "eval_loss": 3.5090487003326416, "eval_runtime": 79.6383, "eval_samples_per_second": 82.988, "eval_sequential_score": 0.7927361346499643, "eval_steps_per_second": 1.306, "eval_sts-dev-1152_pearson_cosine": 0.790420349742375, "eval_sts-dev-1152_spearman_cosine": 0.7927361346499643, "eval_sts-dev-512_pearson_cosine": 0.7865016349246065, "eval_sts-dev-512_spearman_cosine": 0.7896668024220279, "eval_sts-dev-768_pearson_cosine": 0.7865658784180989, "eval_sts-dev-768_spearman_cosine": 0.7894374189283365, "eval_sts-dev-960_pearson_cosine": 0.78835917820683, "eval_sts-dev-960_spearman_cosine": 0.7907010125477784, "step": 16000 }, { "epoch": 0.45467302834689866, "grad_norm": 2128073.0, "learning_rate": 3.029594286961674e-05, "loss": 2.0174, "step": 16200 }, { "epoch": 0.46028627561044066, "grad_norm": 1494404.25, "learning_rate": 2.9984095799419964e-05, "loss": 1.9566, "step": 16400 }, { "epoch": 0.46309289924221164, "eval_loss": 3.305478572845459, "eval_runtime": 76.7558, "eval_samples_per_second": 86.104, "eval_sequential_score": 0.8035377612749389, "eval_steps_per_second": 1.355, "eval_sts-dev-1152_pearson_cosine": 0.7997045755932278, "eval_sts-dev-1152_spearman_cosine": 0.8035377612749389, "eval_sts-dev-512_pearson_cosine": 0.7962257322892368, "eval_sts-dev-512_spearman_cosine": 0.8010941658538488, "eval_sts-dev-768_pearson_cosine": 0.7967126438007915, "eval_sts-dev-768_spearman_cosine": 0.8016654322418212, "eval_sts-dev-960_pearson_cosine": 0.7985639882269593, "eval_sts-dev-960_spearman_cosine": 0.8025996694295968, "step": 16500 }, { "epoch": 0.4658995228739826, "grad_norm": 21955176.0, "learning_rate": 2.967224872922319e-05, "loss": 1.8733, "step": 16600 }, { "epoch": 0.4715127701375246, "grad_norm": 1634486.5, "learning_rate": 2.9360401659026416e-05, "loss": 1.8465, "step": 16800 }, { "epoch": 0.4771260174010665, "grad_norm": 1099246.25, "learning_rate": 2.904855458882964e-05, "loss": 1.8083, "step": 17000 }, { "epoch": 0.4771260174010665, "eval_loss": 3.1462347507476807, "eval_runtime": 76.7838, "eval_samples_per_second": 86.073, "eval_sequential_score": 0.8044901468397658, "eval_steps_per_second": 1.354, "eval_sts-dev-1152_pearson_cosine": 0.8003150062678777, "eval_sts-dev-1152_spearman_cosine": 0.8044901468397658, "eval_sts-dev-512_pearson_cosine": 0.7967573482929446, "eval_sts-dev-512_spearman_cosine": 0.8015237418575695, "eval_sts-dev-768_pearson_cosine": 0.7971621831925142, "eval_sts-dev-768_spearman_cosine": 0.801918123779033, "eval_sts-dev-960_pearson_cosine": 0.7989525078627395, "eval_sts-dev-960_spearman_cosine": 0.8033675149464842, "step": 17000 }, { "epoch": 0.4827392646646085, "grad_norm": 1532188.125, "learning_rate": 2.8736707518632867e-05, "loss": 1.7193, "step": 17200 }, { "epoch": 0.48835251192815043, "grad_norm": 1495786.125, "learning_rate": 2.8424860448436085e-05, "loss": 1.7423, "step": 17400 }, { "epoch": 0.4911591355599214, "eval_loss": 3.0544025897979736, "eval_runtime": 77.4424, "eval_samples_per_second": 85.341, "eval_sequential_score": 0.8085294051469485, "eval_steps_per_second": 1.343, "eval_sts-dev-1152_pearson_cosine": 0.806796528773235, "eval_sts-dev-1152_spearman_cosine": 0.8085294051469485, "eval_sts-dev-512_pearson_cosine": 0.8042370140263899, "eval_sts-dev-512_spearman_cosine": 0.8066761351903039, "eval_sts-dev-768_pearson_cosine": 0.8033611078675769, "eval_sts-dev-768_spearman_cosine": 0.8060225357046799, "eval_sts-dev-960_pearson_cosine": 0.8053597943880864, "eval_sts-dev-960_spearman_cosine": 0.8073196764122358, "step": 17500 }, { "epoch": 0.4939657591916924, "grad_norm": 1554646.625, "learning_rate": 2.8113013378239312e-05, "loss": 1.6114, "step": 17600 }, { "epoch": 0.49957900645523434, "grad_norm": 1555168.625, "learning_rate": 2.7801166308042536e-05, "loss": 1.6524, "step": 17800 }, { "epoch": 0.5051922537187763, "grad_norm": 2316977.25, "learning_rate": 2.748931923784576e-05, "loss": 1.568, "step": 18000 }, { "epoch": 0.5051922537187763, "eval_loss": 3.023185968399048, "eval_runtime": 77.2548, "eval_samples_per_second": 85.548, "eval_sequential_score": 0.8160116208449028, "eval_steps_per_second": 1.346, "eval_sts-dev-1152_pearson_cosine": 0.8098685109159951, "eval_sts-dev-1152_spearman_cosine": 0.8160116208449028, "eval_sts-dev-512_pearson_cosine": 0.8067482146442919, "eval_sts-dev-512_spearman_cosine": 0.8138083394885887, "eval_sts-dev-768_pearson_cosine": 0.8066934851514658, "eval_sts-dev-768_spearman_cosine": 0.8137348986202628, "eval_sts-dev-960_pearson_cosine": 0.8086043862820518, "eval_sts-dev-960_spearman_cosine": 0.8147544112729422, "step": 18000 }, { "epoch": 0.5108055009823183, "grad_norm": 1329637.625, "learning_rate": 2.7177472167648988e-05, "loss": 1.5263, "step": 18200 }, { "epoch": 0.5164187482458602, "grad_norm": 1966328.0, "learning_rate": 2.6865625097452212e-05, "loss": 1.5547, "step": 18400 }, { "epoch": 0.5192253718776312, "eval_loss": 2.870816469192505, "eval_runtime": 76.6326, "eval_samples_per_second": 86.243, "eval_sequential_score": 0.8176598949005246, "eval_steps_per_second": 1.357, "eval_sts-dev-1152_pearson_cosine": 0.814666190335647, "eval_sts-dev-1152_spearman_cosine": 0.8176598949005246, "eval_sts-dev-512_pearson_cosine": 0.8121636324942605, "eval_sts-dev-512_spearman_cosine": 0.8159107971728017, "eval_sts-dev-768_pearson_cosine": 0.8122018182896737, "eval_sts-dev-768_spearman_cosine": 0.816248481623874, "eval_sts-dev-960_pearson_cosine": 0.813853882558624, "eval_sts-dev-960_spearman_cosine": 0.8171316174695632, "step": 18500 }, { "epoch": 0.5220319955094022, "grad_norm": 1405333.625, "learning_rate": 2.6553778027255433e-05, "loss": 1.5059, "step": 18600 }, { "epoch": 0.5276452427729441, "grad_norm": 2138761.0, "learning_rate": 2.6241930957058657e-05, "loss": 1.4385, "step": 18800 }, { "epoch": 0.5332584900364861, "grad_norm": 2403917.0, "learning_rate": 2.5930083886861884e-05, "loss": 1.476, "step": 19000 }, { "epoch": 0.5332584900364861, "eval_loss": 2.9468226432800293, "eval_runtime": 76.877, "eval_samples_per_second": 85.969, "eval_sequential_score": 0.81019252718923, "eval_steps_per_second": 1.353, "eval_sts-dev-1152_pearson_cosine": 0.8023306627696717, "eval_sts-dev-1152_spearman_cosine": 0.81019252718923, "eval_sts-dev-512_pearson_cosine": 0.8017622030828988, "eval_sts-dev-512_spearman_cosine": 0.8093692797674851, "eval_sts-dev-768_pearson_cosine": 0.8003993423781782, "eval_sts-dev-768_spearman_cosine": 0.8091560440850984, "eval_sts-dev-960_pearson_cosine": 0.8019211195358515, "eval_sts-dev-960_spearman_cosine": 0.8101363718932728, "step": 19000 }, { "epoch": 0.538871737300028, "grad_norm": 1055316.25, "learning_rate": 2.561823681666511e-05, "loss": 1.4558, "step": 19200 }, { "epoch": 0.54448498456357, "grad_norm": 1079136.125, "learning_rate": 2.5306389746468333e-05, "loss": 1.4557, "step": 19400 }, { "epoch": 0.547291608195341, "eval_loss": 2.8981781005859375, "eval_runtime": 79.4411, "eval_samples_per_second": 83.194, "eval_sequential_score": 0.8094444352298047, "eval_steps_per_second": 1.309, "eval_sts-dev-1152_pearson_cosine": 0.8031328356570406, "eval_sts-dev-1152_spearman_cosine": 0.8094444352298047, "eval_sts-dev-512_pearson_cosine": 0.8001098459602658, "eval_sts-dev-512_spearman_cosine": 0.8070996808860118, "eval_sts-dev-768_pearson_cosine": 0.8001671724511775, "eval_sts-dev-768_spearman_cosine": 0.807121764709248, "eval_sts-dev-960_pearson_cosine": 0.8020898505862861, "eval_sts-dev-960_spearman_cosine": 0.8084113769099328, "step": 19500 }, { "epoch": 0.550098231827112, "grad_norm": 1276695.375, "learning_rate": 2.4994542676271557e-05, "loss": 1.4552, "step": 19600 }, { "epoch": 0.555711479090654, "grad_norm": 1328590.75, "learning_rate": 2.468269560607478e-05, "loss": 1.4342, "step": 19800 }, { "epoch": 0.561324726354196, "grad_norm": 1148225.75, "learning_rate": 2.4370848535878008e-05, "loss": 1.4503, "step": 20000 }, { "epoch": 0.561324726354196, "eval_loss": 2.807321310043335, "eval_runtime": 77.894, "eval_samples_per_second": 84.846, "eval_sequential_score": 0.8179010934691895, "eval_steps_per_second": 1.335, "eval_sts-dev-1152_pearson_cosine": 0.8140052747106419, "eval_sts-dev-1152_spearman_cosine": 0.8179010934691895, "eval_sts-dev-512_pearson_cosine": 0.8113404216022915, "eval_sts-dev-512_spearman_cosine": 0.8159118696426358, "eval_sts-dev-768_pearson_cosine": 0.8108446270228379, "eval_sts-dev-768_spearman_cosine": 0.815575862581075, "eval_sts-dev-960_pearson_cosine": 0.8128306079700292, "eval_sts-dev-960_spearman_cosine": 0.8169448135786315, "step": 20000 }, { "epoch": 0.5669379736177379, "grad_norm": 1494490.25, "learning_rate": 2.405900146568123e-05, "loss": 1.391, "step": 20200 }, { "epoch": 0.5725512208812799, "grad_norm": 1219068.5, "learning_rate": 2.3747154395484456e-05, "loss": 1.3529, "step": 20400 }, { "epoch": 0.5753578445130508, "eval_loss": 2.697373151779175, "eval_runtime": 77.2603, "eval_samples_per_second": 85.542, "eval_sequential_score": 0.818387454814582, "eval_steps_per_second": 1.346, "eval_sts-dev-1152_pearson_cosine": 0.814425658636787, "eval_sts-dev-1152_spearman_cosine": 0.818387454814582, "eval_sts-dev-512_pearson_cosine": 0.8114864131532289, "eval_sts-dev-512_spearman_cosine": 0.816474256076468, "eval_sts-dev-768_pearson_cosine": 0.8111751402507601, "eval_sts-dev-768_spearman_cosine": 0.8161965909973262, "eval_sts-dev-960_pearson_cosine": 0.8133366326386466, "eval_sts-dev-960_spearman_cosine": 0.817509136299924, "step": 20500 }, { "epoch": 0.5781644681448218, "grad_norm": 1434187.0, "learning_rate": 2.343530732528768e-05, "loss": 1.3428, "step": 20600 }, { "epoch": 0.5837777154083638, "grad_norm": 9602563.0, "learning_rate": 2.3123460255090905e-05, "loss": 1.3401, "step": 20800 }, { "epoch": 0.5893909626719057, "grad_norm": 1751803.25, "learning_rate": 2.281161318489413e-05, "loss": 1.3809, "step": 21000 }, { "epoch": 0.5893909626719057, "eval_loss": 2.703920602798462, "eval_runtime": 77.5938, "eval_samples_per_second": 85.174, "eval_sequential_score": 0.8188296877385577, "eval_steps_per_second": 1.34, "eval_sts-dev-1152_pearson_cosine": 0.8145212969625442, "eval_sts-dev-1152_spearman_cosine": 0.8188296877385577, "eval_sts-dev-512_pearson_cosine": 0.8121156149968475, "eval_sts-dev-512_spearman_cosine": 0.8170665731423932, "eval_sts-dev-768_pearson_cosine": 0.8113063721736813, "eval_sts-dev-768_spearman_cosine": 0.8162345284905963, "eval_sts-dev-960_pearson_cosine": 0.813392544100018, "eval_sts-dev-960_spearman_cosine": 0.8177012935138736, "step": 21000 }, { "epoch": 0.5950042099354477, "grad_norm": 1005006.3125, "learning_rate": 2.2499766114697353e-05, "loss": 1.3193, "step": 21200 }, { "epoch": 0.6006174571989896, "grad_norm": 2510840.0, "learning_rate": 2.2187919044500577e-05, "loss": 1.2531, "step": 21400 }, { "epoch": 0.6034240808307606, "eval_loss": 2.6553146839141846, "eval_runtime": 76.7412, "eval_samples_per_second": 86.121, "eval_sequential_score": 0.8132629704436432, "eval_steps_per_second": 1.355, "eval_sts-dev-1152_pearson_cosine": 0.8100792978741007, "eval_sts-dev-1152_spearman_cosine": 0.8132629704436432, "eval_sts-dev-512_pearson_cosine": 0.8064513413918455, "eval_sts-dev-512_spearman_cosine": 0.810722844279224, "eval_sts-dev-768_pearson_cosine": 0.8070561573065372, "eval_sts-dev-768_spearman_cosine": 0.8113323371597876, "eval_sts-dev-960_pearson_cosine": 0.8088718794978185, "eval_sts-dev-960_spearman_cosine": 0.812389308544286, "step": 21500 }, { "epoch": 0.6062307044625316, "grad_norm": 919804.125, "learning_rate": 2.18760719743038e-05, "loss": 1.3294, "step": 21600 }, { "epoch": 0.6118439517260735, "grad_norm": 1363194.375, "learning_rate": 2.156422490410703e-05, "loss": 1.3076, "step": 21800 }, { "epoch": 0.6174571989896155, "grad_norm": 1170667.75, "learning_rate": 2.125237783391025e-05, "loss": 1.2634, "step": 22000 }, { "epoch": 0.6174571989896155, "eval_loss": 2.615736484527588, "eval_runtime": 77.1191, "eval_samples_per_second": 85.699, "eval_sequential_score": 0.8135000785581838, "eval_steps_per_second": 1.349, "eval_sts-dev-1152_pearson_cosine": 0.8105061703368251, "eval_sts-dev-1152_spearman_cosine": 0.8135000785581838, "eval_sts-dev-512_pearson_cosine": 0.8074183216846054, "eval_sts-dev-512_spearman_cosine": 0.8123168589567965, "eval_sts-dev-768_pearson_cosine": 0.808323777710555, "eval_sts-dev-768_spearman_cosine": 0.8122644439071114, "eval_sts-dev-960_pearson_cosine": 0.809931025467389, "eval_sts-dev-960_spearman_cosine": 0.8131210124611652, "step": 22000 }, { "epoch": 0.6230704462531574, "grad_norm": 2690514.75, "learning_rate": 2.0940530763713477e-05, "loss": 1.242, "step": 22200 }, { "epoch": 0.6286836935166994, "grad_norm": 687846.625, "learning_rate": 2.06286836935167e-05, "loss": 1.2545, "step": 22400 }, { "epoch": 0.6314903171484704, "eval_loss": 2.6083288192749023, "eval_runtime": 76.9345, "eval_samples_per_second": 85.904, "eval_sequential_score": 0.8198803113021629, "eval_steps_per_second": 1.352, "eval_sts-dev-1152_pearson_cosine": 0.8155088381997109, "eval_sts-dev-1152_spearman_cosine": 0.8198803113021629, "eval_sts-dev-512_pearson_cosine": 0.8119264111774269, "eval_sts-dev-512_spearman_cosine": 0.817475857375689, "eval_sts-dev-768_pearson_cosine": 0.8125582626459574, "eval_sts-dev-768_spearman_cosine": 0.8178761844864308, "eval_sts-dev-960_pearson_cosine": 0.8147256189246097, "eval_sts-dev-960_spearman_cosine": 0.8192721867604403, "step": 22500 }, { "epoch": 0.6342969407802413, "grad_norm": 1860284.25, "learning_rate": 2.0316836623319925e-05, "loss": 1.2362, "step": 22600 }, { "epoch": 0.6399101880437833, "grad_norm": 7078587.0, "learning_rate": 2.000498955312315e-05, "loss": 1.1474, "step": 22800 }, { "epoch": 0.6455234353073253, "grad_norm": 1408995.125, "learning_rate": 1.9693142482926373e-05, "loss": 1.2125, "step": 23000 }, { "epoch": 0.6455234353073253, "eval_loss": 2.5617470741271973, "eval_runtime": 76.5184, "eval_samples_per_second": 86.371, "eval_sequential_score": 0.8207731554186944, "eval_steps_per_second": 1.359, "eval_sts-dev-1152_pearson_cosine": 0.8178959248835184, "eval_sts-dev-1152_spearman_cosine": 0.8207731554186944, "eval_sts-dev-512_pearson_cosine": 0.8140061268760344, "eval_sts-dev-512_spearman_cosine": 0.8180611571485459, "eval_sts-dev-768_pearson_cosine": 0.8150538202678299, "eval_sts-dev-768_spearman_cosine": 0.8188299877250856, "eval_sts-dev-960_pearson_cosine": 0.8170381399447351, "eval_sts-dev-960_spearman_cosine": 0.8200353073212047, "step": 23000 }, { "epoch": 0.6511366825708672, "grad_norm": 1224838.5, "learning_rate": 1.9381295412729597e-05, "loss": 1.206, "step": 23200 }, { "epoch": 0.6567499298344092, "grad_norm": 855258.6875, "learning_rate": 1.906944834253282e-05, "loss": 1.1236, "step": 23400 }, { "epoch": 0.6595565534661801, "eval_loss": 2.5598337650299072, "eval_runtime": 77.1811, "eval_samples_per_second": 85.63, "eval_sequential_score": 0.8251304266663447, "eval_steps_per_second": 1.347, "eval_sts-dev-1152_pearson_cosine": 0.8201666525971009, "eval_sts-dev-1152_spearman_cosine": 0.8251304266663447, "eval_sts-dev-512_pearson_cosine": 0.8165225072856572, "eval_sts-dev-512_spearman_cosine": 0.8231945111392679, "eval_sts-dev-768_pearson_cosine": 0.8174329120885295, "eval_sts-dev-768_spearman_cosine": 0.8234756819083271, "eval_sts-dev-960_pearson_cosine": 0.8193489194006733, "eval_sts-dev-960_spearman_cosine": 0.8246625530393321, "step": 23500 }, { "epoch": 0.6623631770979511, "grad_norm": 919016.75, "learning_rate": 1.875760127233605e-05, "loss": 1.1785, "step": 23600 }, { "epoch": 0.6679764243614931, "grad_norm": 757121.4375, "learning_rate": 1.844575420213927e-05, "loss": 1.1376, "step": 23800 }, { "epoch": 0.6735896716250351, "grad_norm": 844441.125, "learning_rate": 1.8133907131942497e-05, "loss": 1.1386, "step": 24000 }, { "epoch": 0.6735896716250351, "eval_loss": 2.522897958755493, "eval_runtime": 79.1821, "eval_samples_per_second": 83.466, "eval_sequential_score": 0.8207794501178992, "eval_steps_per_second": 1.313, "eval_sts-dev-1152_pearson_cosine": 0.8186757389035366, "eval_sts-dev-1152_spearman_cosine": 0.8207794501178992, "eval_sts-dev-512_pearson_cosine": 0.8149579009500032, "eval_sts-dev-512_spearman_cosine": 0.8186514991034529, "eval_sts-dev-768_pearson_cosine": 0.8158079152733716, "eval_sts-dev-768_spearman_cosine": 0.8186101448904235, "eval_sts-dev-960_pearson_cosine": 0.8173995719285795, "eval_sts-dev-960_spearman_cosine": 0.8198375969222076, "step": 24000 }, { "epoch": 0.6792029188885771, "grad_norm": 2238345.25, "learning_rate": 1.782206006174572e-05, "loss": 1.1293, "step": 24200 }, { "epoch": 0.684816166152119, "grad_norm": 1067979.625, "learning_rate": 1.7510212991548945e-05, "loss": 1.101, "step": 24400 }, { "epoch": 0.68762278978389, "eval_loss": 2.542306661605835, "eval_runtime": 78.4755, "eval_samples_per_second": 84.217, "eval_sequential_score": 0.8273983158949195, "eval_steps_per_second": 1.325, "eval_sts-dev-1152_pearson_cosine": 0.8233467985109526, "eval_sts-dev-1152_spearman_cosine": 0.8273983158949195, "eval_sts-dev-512_pearson_cosine": 0.8191791525417332, "eval_sts-dev-512_spearman_cosine": 0.8241280875045953, "eval_sts-dev-768_pearson_cosine": 0.8205748238510661, "eval_sts-dev-768_spearman_cosine": 0.8253162521368189, "eval_sts-dev-960_pearson_cosine": 0.8223154809807092, "eval_sts-dev-960_spearman_cosine": 0.8263287206559005, "step": 24500 }, { "epoch": 0.690429413415661, "grad_norm": 1075801.875, "learning_rate": 1.719836592135217e-05, "loss": 1.1306, "step": 24600 }, { "epoch": 0.696042660679203, "grad_norm": 1117327.75, "learning_rate": 1.6886518851155393e-05, "loss": 1.0517, "step": 24800 }, { "epoch": 0.7016559079427449, "grad_norm": 1052429.25, "learning_rate": 1.657467178095862e-05, "loss": 1.0617, "step": 25000 }, { "epoch": 0.7016559079427449, "eval_loss": 2.4987776279449463, "eval_runtime": 78.7361, "eval_samples_per_second": 83.939, "eval_sequential_score": 0.8318737027912343, "eval_steps_per_second": 1.321, "eval_sts-dev-1152_pearson_cosine": 0.828742240805668, "eval_sts-dev-1152_spearman_cosine": 0.8318737027912343, "eval_sts-dev-512_pearson_cosine": 0.8253460176937367, "eval_sts-dev-512_spearman_cosine": 0.8296126954497343, "eval_sts-dev-768_pearson_cosine": 0.8260069313893207, "eval_sts-dev-768_spearman_cosine": 0.8299749701259963, "eval_sts-dev-960_pearson_cosine": 0.8277316254972338, "eval_sts-dev-960_spearman_cosine": 0.8309505358919553, "step": 25000 }, { "epoch": 0.7072691552062869, "grad_norm": 1933842.375, "learning_rate": 1.626282471076184e-05, "loss": 1.0408, "step": 25200 }, { "epoch": 0.7128824024698288, "grad_norm": 1313519.75, "learning_rate": 1.595097764056507e-05, "loss": 1.0741, "step": 25400 }, { "epoch": 0.7156890261015998, "eval_loss": 2.4365484714508057, "eval_runtime": 76.9868, "eval_samples_per_second": 85.846, "eval_sequential_score": 0.8275808733889652, "eval_steps_per_second": 1.351, "eval_sts-dev-1152_pearson_cosine": 0.8235693092621815, "eval_sts-dev-1152_spearman_cosine": 0.8275808733889652, "eval_sts-dev-512_pearson_cosine": 0.8200900106962739, "eval_sts-dev-512_spearman_cosine": 0.8251150183246924, "eval_sts-dev-768_pearson_cosine": 0.8206274312682788, "eval_sts-dev-768_spearman_cosine": 0.8253381212332793, "eval_sts-dev-960_pearson_cosine": 0.8226055577499416, "eval_sts-dev-960_spearman_cosine": 0.8267093188750975, "step": 25500 }, { "epoch": 0.7184956497333708, "grad_norm": 2527256.75, "learning_rate": 1.5639130570368293e-05, "loss": 1.0373, "step": 25600 }, { "epoch": 0.7241088969969127, "grad_norm": 1548847.25, "learning_rate": 1.5327283500171517e-05, "loss": 1.0239, "step": 25800 }, { "epoch": 0.7297221442604547, "grad_norm": 771091.3125, "learning_rate": 1.5015436429974741e-05, "loss": 0.9982, "step": 26000 }, { "epoch": 0.7297221442604547, "eval_loss": 2.417414903640747, "eval_runtime": 76.5981, "eval_samples_per_second": 86.282, "eval_sequential_score": 0.8253319450226069, "eval_steps_per_second": 1.358, "eval_sts-dev-1152_pearson_cosine": 0.8229695273001155, "eval_sts-dev-1152_spearman_cosine": 0.8253319450226069, "eval_sts-dev-512_pearson_cosine": 0.8186723943926887, "eval_sts-dev-512_spearman_cosine": 0.8223105621184139, "eval_sts-dev-768_pearson_cosine": 0.8198835227077859, "eval_sts-dev-768_spearman_cosine": 0.8231779055377496, "eval_sts-dev-960_pearson_cosine": 0.8221140652235788, "eval_sts-dev-960_spearman_cosine": 0.8246685858536678, "step": 26000 }, { "epoch": 0.7353353915239966, "grad_norm": 1151759.625, "learning_rate": 1.4703589359777967e-05, "loss": 0.9829, "step": 26200 }, { "epoch": 0.7409486387875386, "grad_norm": 1203075.5, "learning_rate": 1.439174228958119e-05, "loss": 0.9758, "step": 26400 }, { "epoch": 0.7437552624193096, "eval_loss": 2.421391010284424, "eval_runtime": 78.2685, "eval_samples_per_second": 84.44, "eval_sequential_score": 0.8273907944767567, "eval_steps_per_second": 1.329, "eval_sts-dev-1152_pearson_cosine": 0.8245632242048817, "eval_sts-dev-1152_spearman_cosine": 0.8273907944767567, "eval_sts-dev-512_pearson_cosine": 0.821320018207968, "eval_sts-dev-512_spearman_cosine": 0.8249051957336987, "eval_sts-dev-768_pearson_cosine": 0.8222668265396057, "eval_sts-dev-768_spearman_cosine": 0.8258229540823162, "eval_sts-dev-960_pearson_cosine": 0.8239329103520061, "eval_sts-dev-960_spearman_cosine": 0.8268971547689833, "step": 26500 }, { "epoch": 0.7465618860510805, "grad_norm": 873702.5625, "learning_rate": 1.4079895219384415e-05, "loss": 1.0123, "step": 26600 }, { "epoch": 0.7521751333146225, "grad_norm": 1191796.125, "learning_rate": 1.3768048149187641e-05, "loss": 1.0156, "step": 26800 }, { "epoch": 0.7577883805781644, "grad_norm": 3614394.75, "learning_rate": 1.3456201078990863e-05, "loss": 0.9687, "step": 27000 }, { "epoch": 0.7577883805781644, "eval_loss": 2.381603240966797, "eval_runtime": 76.1545, "eval_samples_per_second": 86.784, "eval_sequential_score": 0.822164187171298, "eval_steps_per_second": 1.366, "eval_sts-dev-1152_pearson_cosine": 0.8179990780257289, "eval_sts-dev-1152_spearman_cosine": 0.822164187171298, "eval_sts-dev-512_pearson_cosine": 0.8141681847358285, "eval_sts-dev-512_spearman_cosine": 0.8190636570312585, "eval_sts-dev-768_pearson_cosine": 0.8154464750993287, "eval_sts-dev-768_spearman_cosine": 0.8202098023000759, "eval_sts-dev-960_pearson_cosine": 0.8171115628702965, "eval_sts-dev-960_spearman_cosine": 0.821364481930215, "step": 27000 }, { "epoch": 0.7634016278417064, "grad_norm": 1148854.5, "learning_rate": 1.3144354008794087e-05, "loss": 0.9569, "step": 27200 }, { "epoch": 0.7690148751052484, "grad_norm": 1114442.0, "learning_rate": 1.2832506938597313e-05, "loss": 0.9543, "step": 27400 }, { "epoch": 0.7718214987370193, "eval_loss": 2.355195999145508, "eval_runtime": 76.6074, "eval_samples_per_second": 86.271, "eval_sequential_score": 0.8289062223349991, "eval_steps_per_second": 1.358, "eval_sts-dev-1152_pearson_cosine": 0.8234371744631814, "eval_sts-dev-1152_spearman_cosine": 0.8289062223349991, "eval_sts-dev-512_pearson_cosine": 0.8207871259825303, "eval_sts-dev-512_spearman_cosine": 0.8272360451735147, "eval_sts-dev-768_pearson_cosine": 0.8214215565214141, "eval_sts-dev-768_spearman_cosine": 0.8275606625119857, "eval_sts-dev-960_pearson_cosine": 0.8229212299901374, "eval_sts-dev-960_spearman_cosine": 0.8283389770018872, "step": 27500 }, { "epoch": 0.7746281223687903, "grad_norm": 1473106.125, "learning_rate": 1.2520659868400536e-05, "loss": 0.9453, "step": 27600 }, { "epoch": 0.7802413696323323, "grad_norm": 1301560.0, "learning_rate": 1.2208812798203761e-05, "loss": 0.9948, "step": 27800 }, { "epoch": 0.7858546168958742, "grad_norm": 689391.125, "learning_rate": 1.1896965728006985e-05, "loss": 0.9874, "step": 28000 }, { "epoch": 0.7858546168958742, "eval_loss": 2.3867998123168945, "eval_runtime": 77.0268, "eval_samples_per_second": 85.801, "eval_sequential_score": 0.8280466360566167, "eval_steps_per_second": 1.35, "eval_sts-dev-1152_pearson_cosine": 0.8240164476876701, "eval_sts-dev-1152_spearman_cosine": 0.8280466360566167, "eval_sts-dev-512_pearson_cosine": 0.8215847717028831, "eval_sts-dev-512_spearman_cosine": 0.8264438807620287, "eval_sts-dev-768_pearson_cosine": 0.8221793287179034, "eval_sts-dev-768_spearman_cosine": 0.8268230282109075, "eval_sts-dev-960_pearson_cosine": 0.8233492812551204, "eval_sts-dev-960_spearman_cosine": 0.8274233448566846, "step": 28000 }, { "epoch": 0.7914678641594162, "grad_norm": 1482000.5, "learning_rate": 1.1585118657810211e-05, "loss": 0.8872, "step": 28200 }, { "epoch": 0.7970811114229582, "grad_norm": 11877957.0, "learning_rate": 1.1273271587613435e-05, "loss": 0.9327, "step": 28400 }, { "epoch": 0.7998877350547292, "eval_loss": 2.3834006786346436, "eval_runtime": 76.4589, "eval_samples_per_second": 86.439, "eval_sequential_score": 0.8250187145214471, "eval_steps_per_second": 1.36, "eval_sts-dev-1152_pearson_cosine": 0.821012856551093, "eval_sts-dev-1152_spearman_cosine": 0.8250187145214471, "eval_sts-dev-512_pearson_cosine": 0.8184871654411439, "eval_sts-dev-512_spearman_cosine": 0.823483460862761, "eval_sts-dev-768_pearson_cosine": 0.8190353483169376, "eval_sts-dev-768_spearman_cosine": 0.8237308290024404, "eval_sts-dev-960_pearson_cosine": 0.8205974063302174, "eval_sts-dev-960_spearman_cosine": 0.8245795527826257, "step": 28500 }, { "epoch": 0.8026943586865002, "grad_norm": 993543.625, "learning_rate": 1.096142451741666e-05, "loss": 0.8715, "step": 28600 }, { "epoch": 0.8083076059500421, "grad_norm": 1650682.25, "learning_rate": 1.0649577447219884e-05, "loss": 0.9566, "step": 28800 }, { "epoch": 0.8139208532135841, "grad_norm": 1086581.625, "learning_rate": 1.0337730377023108e-05, "loss": 0.9265, "step": 29000 }, { "epoch": 0.8139208532135841, "eval_loss": 2.345508337020874, "eval_runtime": 78.2528, "eval_samples_per_second": 84.457, "eval_sequential_score": 0.8306689922163598, "eval_steps_per_second": 1.329, "eval_sts-dev-1152_pearson_cosine": 0.8264443610084379, "eval_sts-dev-1152_spearman_cosine": 0.8306689922163598, "eval_sts-dev-512_pearson_cosine": 0.8238103920299558, "eval_sts-dev-512_spearman_cosine": 0.8293245725151981, "eval_sts-dev-768_pearson_cosine": 0.8243518007889306, "eval_sts-dev-768_spearman_cosine": 0.8293091429698137, "eval_sts-dev-960_pearson_cosine": 0.8258566703064338, "eval_sts-dev-960_spearman_cosine": 0.830247434103489, "step": 29000 } ], "logging_steps": 200, "max_steps": 35630, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }