diff --git a/resid_post_layer_11/trainer_0/ae.pt b/resid_post_layer_11/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f47fdd23135ce120c351a41e714224bd7d4227e --- /dev/null +++ b/resid_post_layer_11/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a577b5c6768545b6452bfea81085034cbdf1cabe79ddd9d1d61dbb74606306f +size 3758637401 diff --git a/resid_post_layer_11/trainer_0/config.json b/resid_post_layer_11/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d36dddbe4a06aa32adcb9ba611df33eec47a236 --- /dev/null +++ b/resid_post_layer_11/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 11, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_0", + "submodule_name": "resid_post_layer_11" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_0/eval_results.json b/resid_post_layer_11/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..14e66b84805cbc8c9a8282b0d1f05da167a8b14c --- /dev/null +++ b/resid_post_layer_11/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 21.99125, "l1_loss": 135.775, "l0": 29.6123583984375, "frac_variance_explained": 0.793046875, "cossim": 0.91607421875, "l2_ratio": 0.916484375, "relative_reconstruction_bias": 1.0026171875, "loss_original": 2.392890625, "loss_reconstructed": 2.449375, "loss_zero": 11.820625, "frac_recovered": 0.9944140625, "frac_alive": 0.675262451171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_1/ae.pt b/resid_post_layer_11/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ea3a30b1818229fb752ae785fed8658734d2292 --- /dev/null +++ b/resid_post_layer_11/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bddbd229d59c11ed61f77a95a89da3ed141214e891677bb5d7facf64b57100 +size 3758637401 diff --git a/resid_post_layer_11/trainer_1/config.json b/resid_post_layer_11/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee4c78a22505b765eaaebc4d3374c5a35cab3970 --- /dev/null +++ b/resid_post_layer_11/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 11, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_1", + "submodule_name": "resid_post_layer_11" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_1/eval_results.json b/resid_post_layer_11/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..13e29df2fb59dbff8463b8a549e324f768afda67 --- /dev/null +++ b/resid_post_layer_11/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 20.0925, "l1_loss": 184.49, "l0": 58.8667578125, "frac_variance_explained": 0.82669921875, "cossim": 0.93015625, "l2_ratio": 0.93037109375, "relative_reconstruction_bias": 1.0019140625, "loss_original": 2.42234375, "loss_reconstructed": 2.4514453125, "loss_zero": 11.8234375, "frac_recovered": 0.998515625, "frac_alive": 0.763702392578125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_2/ae.pt b/resid_post_layer_11/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a966073989f0bb1cf51e633cbbbdf019af6808b --- /dev/null +++ b/resid_post_layer_11/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d0fdd0fbf3bd21fbe0b3cdd72516830b243ee54799d5e543781f7124f9f731d +size 3758637401 diff --git a/resid_post_layer_11/trainer_2/config.json b/resid_post_layer_11/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..abecc09846ec116b27bf86c526be78afe260f0f6 --- /dev/null +++ b/resid_post_layer_11/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 11, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_2", + "submodule_name": "resid_post_layer_11" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_2/eval_results.json b/resid_post_layer_11/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..97a80a20e8c624ae04af3e1e081fb21ac40bfdab --- /dev/null +++ b/resid_post_layer_11/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 18.288125, "l1_loss": 256.115, "l0": 115.6953466796875, "frac_variance_explained": 0.8566796875, "cossim": 0.94232421875, "l2_ratio": 0.94181640625, "relative_reconstruction_bias": 1.000546875, "loss_original": 2.42591796875, "loss_reconstructed": 2.4419921875, "loss_zero": 11.7809375, "frac_recovered": 0.99998046875, "frac_alive": 0.8217086791992188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_3/ae.pt b/resid_post_layer_11/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..8953d7e34bb99270c5a23e7f655d40fd0d4dff8f --- /dev/null +++ b/resid_post_layer_11/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8634b6d64435a89ee7a8ebf16d739c2b0a882eeb1e5219452c79243fea96d1de +size 3758637401 diff --git a/resid_post_layer_11/trainer_3/config.json b/resid_post_layer_11/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5a6b2ae5ca7f2bc75491fbc644cc2565494ac4 --- /dev/null +++ b/resid_post_layer_11/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 11, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_3", + "submodule_name": "resid_post_layer_11" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_11/trainer_3/eval_results.json b/resid_post_layer_11/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..024d5a99235501ece2bac8f904ecde801c88d02c --- /dev/null +++ b/resid_post_layer_11/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 16.47625, "l1_loss": 395.87, "l0": 232.93888671875, "frac_variance_explained": 0.88396484375, "cossim": 0.95333984375, "l2_ratio": 0.95359375, "relative_reconstruction_bias": 1.001640625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43615234375, "loss_zero": 11.829375, "frac_recovered": 0.999140625, "frac_alive": 0.837890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_0/ae.pt b/resid_post_layer_15/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a8cc064c32b2ac1851db4d79df7f098892b82e7 --- /dev/null +++ b/resid_post_layer_15/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1339c52258e95bc64535a90e45067187722e659e0ee03c976db813614b29ab5b +size 3758637401 diff --git a/resid_post_layer_15/trainer_0/config.json b/resid_post_layer_15/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..65d25b84d853378240b53ac714b388b5a7e008f8 --- /dev/null +++ b/resid_post_layer_15/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 15, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_0", + "submodule_name": "resid_post_layer_15" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_0/eval_results.json b/resid_post_layer_15/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d2dda6ff138549d7f922fe592889feb3f996e9ee --- /dev/null +++ b/resid_post_layer_15/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 28.453125, "l1_loss": 166.495, "l0": 29.5351513671875, "frac_variance_explained": 0.76798828125, "cossim": 0.90923828125, "l2_ratio": 0.90875, "relative_reconstruction_bias": 1.00078125, "loss_original": 2.392890625, "loss_reconstructed": 2.485078125, "loss_zero": 16.193125, "frac_recovered": 0.99515625, "frac_alive": 0.6423416137695312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_1/ae.pt b/resid_post_layer_15/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e65b9dfb44da6e06c421da2eafdeb0cf9b7419d --- /dev/null +++ b/resid_post_layer_15/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2efefadd8d85ad1a2bfcf3cb5eceb2976590dc88b4873e56b864c42c387f5cc5 +size 3758637401 diff --git a/resid_post_layer_15/trainer_1/config.json b/resid_post_layer_15/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..eafa5ee955ce5a0a4e854b59ad7315e4a6d97bf5 --- /dev/null +++ b/resid_post_layer_15/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 15, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_1", + "submodule_name": "resid_post_layer_15" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_1/eval_results.json b/resid_post_layer_15/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..be5c519618527bb75d9dd28ed1ffec62091b7617 --- /dev/null +++ b/resid_post_layer_15/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 25.911875, "l1_loss": 228.895, "l0": 59.186298828125, "frac_variance_explained": 0.8069140625, "cossim": 0.9255859375, "l2_ratio": 0.92529296875, "relative_reconstruction_bias": 1.0009375, "loss_original": 2.42234375, "loss_reconstructed": 2.47240234375, "loss_zero": 16.1503125, "frac_recovered": 1.0130078125, "frac_alive": 0.7274703979492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_2/ae.pt b/resid_post_layer_15/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c0be82a9015b13ca866488b68d1461624be61c7 --- /dev/null +++ b/resid_post_layer_15/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349e6a5713950790ed10bde7b055c214b1c5732c6c6317c673de4b8dbf35fcab +size 3758637401 diff --git a/resid_post_layer_15/trainer_2/config.json b/resid_post_layer_15/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c19b3878ce473c5c1dc9bbbef6ea962c7c4e340f --- /dev/null +++ b/resid_post_layer_15/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 15, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_2", + "submodule_name": "resid_post_layer_15" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_2/eval_results.json b/resid_post_layer_15/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ec10370492962c871be1040442a10be68da05c --- /dev/null +++ b/resid_post_layer_15/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 23.581875, "l1_loss": 318.67, "l0": 116.2112255859375, "frac_variance_explained": 0.840546875, "cossim": 0.9380078125, "l2_ratio": 0.937578125, "relative_reconstruction_bias": 0.999765625, "loss_original": 2.42591796875, "loss_reconstructed": 2.462734375, "loss_zero": 16.1625, "frac_recovered": 1.011953125, "frac_alive": 0.7831344604492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_3/ae.pt b/resid_post_layer_15/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..151590275a123876d02f24844811d883de404bbc --- /dev/null +++ b/resid_post_layer_15/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bab21a67e23230984c764284b9496fc8f016a5d1fb5b58180672b405878ad02 +size 3758637401 diff --git a/resid_post_layer_15/trainer_3/config.json b/resid_post_layer_15/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..48e13cc45eef15a2f674bf37c2351947ed700617 --- /dev/null +++ b/resid_post_layer_15/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 15, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_3", + "submodule_name": "resid_post_layer_15" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_15/trainer_3/eval_results.json b/resid_post_layer_15/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1f9efdddc525a19e8b56ac07fa0e42386bbbc345 --- /dev/null +++ b/resid_post_layer_15/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 21.26125, "l1_loss": 613.3, "l0": 232.84298828125, "frac_variance_explained": 0.8707421875, "cossim": 0.94994140625, "l2_ratio": 0.9498046875, "relative_reconstruction_bias": 1.0001953125, "loss_original": 2.41955078125, "loss_reconstructed": 2.4448046875, "loss_zero": 16.1625, "frac_recovered": 1.00779296875, "frac_alive": 0.7854995727539062, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_0/ae.pt b/resid_post_layer_19/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..91f9c6da27f505d006b5fc685db6cfae0d9226a6 --- /dev/null +++ b/resid_post_layer_19/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056bdc0846c5754993987552820c2ec59611b4390ae3a073f1406b2e1030b6d5 +size 3758637401 diff --git a/resid_post_layer_19/trainer_0/config.json b/resid_post_layer_19/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e01b64d31b2a232d7200aa0950c2dbaf4272c27 --- /dev/null +++ b/resid_post_layer_19/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 19, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_0", + "submodule_name": "resid_post_layer_19" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_0/eval_results.json b/resid_post_layer_19/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2b20768d2fae059d21f392c1ac80b65f9c9fd040 --- /dev/null +++ b/resid_post_layer_19/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 39.79, "l1_loss": 240.27, "l0": 29.8869873046875, "frac_variance_explained": 0.7619921875, "cossim": 0.90751953125, "l2_ratio": 0.90701171875, "relative_reconstruction_bias": 1.00177734375, "loss_original": 2.392890625, "loss_reconstructed": 2.475546875, "loss_zero": 10.954375, "frac_recovered": 0.99154296875, "frac_alive": 0.606231689453125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_1/ae.pt b/resid_post_layer_19/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dbe76891d82f3bc87e3b3c95afaf9459860bfa2 --- /dev/null +++ b/resid_post_layer_19/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a90d1309919de4a7712b6d86a7d715c525c686ffe45c1255e8a689b752a8798a +size 3758637401 diff --git a/resid_post_layer_19/trainer_1/config.json b/resid_post_layer_19/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..de89cd027defb81ece18bb8b841a3302c1ea592a --- /dev/null +++ b/resid_post_layer_19/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 19, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_1", + "submodule_name": "resid_post_layer_19" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_1/eval_results.json b/resid_post_layer_19/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..92c6acc415769ac2d9cd6d10b2ae7e42b0ad6c15 --- /dev/null +++ b/resid_post_layer_19/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 35.86875, "l1_loss": 327.34, "l0": 59.9685205078125, "frac_variance_explained": 0.80572265625, "cossim": 0.9255078125, "l2_ratio": 0.92548828125, "relative_reconstruction_bias": 1.00125, "loss_original": 2.42234375, "loss_reconstructed": 2.4575, "loss_zero": 10.970625, "frac_recovered": 0.99763671875, "frac_alive": 0.6950225830078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_2/ae.pt b/resid_post_layer_19/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..03e6e72c945283f003f953430ccf48f026a21909 --- /dev/null +++ b/resid_post_layer_19/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71e0dd6904a6379470b55fe31225cb5d8abf7cf5359f778881b87b08a54e383e +size 3758637401 diff --git a/resid_post_layer_19/trainer_2/config.json b/resid_post_layer_19/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d81ab39afb03949f5095f339b1326a813eab60e5 --- /dev/null +++ b/resid_post_layer_19/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 19, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_2", + "submodule_name": "resid_post_layer_19" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_2/eval_results.json b/resid_post_layer_19/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..89b672648a4d7e41b04fe2258c678216cbbf787d --- /dev/null +++ b/resid_post_layer_19/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 32.340625, "l1_loss": 481.66, "l0": 120.2642041015625, "frac_variance_explained": 0.84236328125, "cossim": 0.93974609375, "l2_ratio": 0.9403125, "relative_reconstruction_bias": 1.001796875, "loss_original": 2.42591796875, "loss_reconstructed": 2.44478515625, "loss_zero": 10.95625, "frac_recovered": 0.99982421875, "frac_alive": 0.7388534545898438, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_3/ae.pt b/resid_post_layer_19/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..28f44268f9fe695778a0acbc0374f27bacf8c811 --- /dev/null +++ b/resid_post_layer_19/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a651eeda01072b4407e4b955ee28c04b2345419f301e8c518b2bd73f9655d271 +size 3758637401 diff --git a/resid_post_layer_19/trainer_3/config.json b/resid_post_layer_19/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec584d08bfca01ab3c7a05bf06fc6bccb2254f3b --- /dev/null +++ b/resid_post_layer_19/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 19, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_3", + "submodule_name": "resid_post_layer_19" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_19/trainer_3/eval_results.json b/resid_post_layer_19/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..76b7cdce4dccc38277628181376f9cae9d84a21b --- /dev/null +++ b/resid_post_layer_19/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 28.92875, "l1_loss": 920.9, "l0": 240.0174609375, "frac_variance_explained": 0.87439453125, "cossim": 0.95271484375, "l2_ratio": 0.9522265625, "relative_reconstruction_bias": 1.000390625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43544921875, "loss_zero": 10.966875, "frac_recovered": 1.00068359375, "frac_alive": 0.7600173950195312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_0/ae.pt b/resid_post_layer_23/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..d86142ca508ffbeb2c058917242dd914158c8b2a --- /dev/null +++ b/resid_post_layer_23/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a593d0da4a10cde4674b48749bf573b14e40119c1cff34cf2d499f334940ef4b +size 3758637401 diff --git a/resid_post_layer_23/trainer_0/config.json b/resid_post_layer_23/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..96027f5c3d777479d26fc95ddee148aeefa513c3 --- /dev/null +++ b/resid_post_layer_23/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 23, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_0", + "submodule_name": "resid_post_layer_23" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_0/eval_results.json b/resid_post_layer_23/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c6dfbbe1dca4d623ebf89f3f02ce61d7211cfd4c --- /dev/null +++ b/resid_post_layer_23/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 84.4425, "l1_loss": 532.56, "l0": 29.5232763671875, "frac_variance_explained": 0.77451171875, "cossim": 0.91509765625, "l2_ratio": 0.91498046875, "relative_reconstruction_bias": 1.001328125, "loss_original": 2.392890625, "loss_reconstructed": 2.52521484375, "loss_zero": 12.08125, "frac_recovered": 0.98640625, "frac_alive": 0.613861083984375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_1/ae.pt b/resid_post_layer_23/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..44244b54404990074bb4cdf2fb72b523c9371795 --- /dev/null +++ b/resid_post_layer_23/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd895c95a13cc1c95946014b988ed0a7e287b0f51ca04175775740a5215705b2 +size 3758637401 diff --git a/resid_post_layer_23/trainer_1/config.json b/resid_post_layer_23/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..663968fb1cd21a28d72e91ab9540d93b7ecb1920 --- /dev/null +++ b/resid_post_layer_23/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 23, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_1", + "submodule_name": "resid_post_layer_23" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_1/eval_results.json b/resid_post_layer_23/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d782d0a92a9e6e917effcffa47e96e7d3bd5aa9d --- /dev/null +++ b/resid_post_layer_23/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 75.2275, "l1_loss": 717.88, "l0": 59.1184375, "frac_variance_explained": 0.82005859375, "cossim": 0.933203125, "l2_ratio": 0.93279296875, "relative_reconstruction_bias": 1.0010546875, "loss_original": 2.42234375, "loss_reconstructed": 2.501484375, "loss_zero": 12.0415625, "frac_recovered": 1.00083984375, "frac_alive": 0.6967620849609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_2/ae.pt b/resid_post_layer_23/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd4ffc19e577b05c6c64331df9beefe3f0883696 --- /dev/null +++ b/resid_post_layer_23/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4e22f7f5db4cb313f002df7d985f4fac93e6fae37890cc43ace57af2c318d4 +size 3758637401 diff --git a/resid_post_layer_23/trainer_2/config.json b/resid_post_layer_23/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5a5c8eafe79257904ca39c095f44166cc41b02dd --- /dev/null +++ b/resid_post_layer_23/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 23, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_2", + "submodule_name": "resid_post_layer_23" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_2/eval_results.json b/resid_post_layer_23/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..704b8802418ccfabf0dad7551b845eb3784d714f --- /dev/null +++ b/resid_post_layer_23/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 66.8325, "l1_loss": 1001.98, "l0": 119.3653466796875, "frac_variance_explained": 0.85810546875, "cossim": 0.9473828125, "l2_ratio": 0.94765625, "relative_reconstruction_bias": 1.0016015625, "loss_original": 2.42591796875, "loss_reconstructed": 2.468515625, "loss_zero": 12.05875, "frac_recovered": 1.00724609375, "frac_alive": 0.7726287841796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_3/ae.pt b/resid_post_layer_23/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5ec6ae1c07f4f0054c3c51affab70ede0ccf93e --- /dev/null +++ b/resid_post_layer_23/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d035398575abe0be9544ed3b9b415402ca0e4c0fd284972d3bc242b6fe5c088 +size 3758637401 diff --git a/resid_post_layer_23/trainer_3/config.json b/resid_post_layer_23/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8612b78d4105ca13ee149d4cde3dfec9ea4fbc5b --- /dev/null +++ b/resid_post_layer_23/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 23, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_3", + "submodule_name": "resid_post_layer_23" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_23/trainer_3/eval_results.json b/resid_post_layer_23/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7710155b3f06a4699b82088034d8a40d8b1461d8 --- /dev/null +++ b/resid_post_layer_23/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 59.355, "l1_loss": 1774.04, "l0": 241.8087744140625, "frac_variance_explained": 0.88796875, "cossim": 0.95853515625, "l2_ratio": 0.95830078125, "relative_reconstruction_bias": 1.00103515625, "loss_original": 2.41955078125, "loss_reconstructed": 2.4435546875, "loss_zero": 12.0590625, "frac_recovered": 1.00390625, "frac_alive": 0.8097000122070312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_0/ae.pt b/resid_post_layer_27/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..f39c315a8dc357a68c4df93b3e320b42a74db14a --- /dev/null +++ b/resid_post_layer_27/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd70caafcecaac370b7116f39fff88a5ae2fdc53c8039ca69d9deaf06bc7bef +size 3758637401 diff --git a/resid_post_layer_27/trainer_0/config.json b/resid_post_layer_27/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..431e5af1593856f73b0fa9c243dbb3234a22d163 --- /dev/null +++ b/resid_post_layer_27/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 27, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_0", + "submodule_name": "resid_post_layer_27" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_0/eval_results.json b/resid_post_layer_27/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..09d16cad5b78008b370d53591922b4c3ca3d8008 --- /dev/null +++ b/resid_post_layer_27/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 203.285, "l1_loss": 1318.68, "l0": 26.949189453125, "frac_variance_explained": 0.7742578125, "cossim": 0.871328125, "l2_ratio": 0.86794921875, "relative_reconstruction_bias": 1.000859375, "loss_original": 2.392890625, "loss_reconstructed": 3.13078125, "loss_zero": 11.6325, "frac_recovered": 0.915869140625, "frac_alive": 0.6044921875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_1/ae.pt b/resid_post_layer_27/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..60d1892ba0fb0004644d939508139a7a03f86bd9 --- /dev/null +++ b/resid_post_layer_27/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c910992603f98c25301a19441796ce6cb3d5129b58f31fee8b925b8d5378d244 +size 3758637401 diff --git a/resid_post_layer_27/trainer_1/config.json b/resid_post_layer_27/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ad33b1c3ca710d2b6684885c604e10cdd114842 --- /dev/null +++ b/resid_post_layer_27/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 27, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_1", + "submodule_name": "resid_post_layer_27" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_1/eval_results.json b/resid_post_layer_27/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8f146fa33537c1540bba824d7293bd6547c1c0ab --- /dev/null +++ b/resid_post_layer_27/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 185.23, "l1_loss": 1765.96, "l0": 54.7008935546875, "frac_variance_explained": 0.811796875, "cossim": 0.89419921875, "l2_ratio": 0.89181640625, "relative_reconstruction_bias": 1.001640625, "loss_original": 2.42234375, "loss_reconstructed": 2.93390625, "loss_zero": 11.61625, "frac_recovered": 0.93986328125, "frac_alive": 0.7092666625976562, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_2/ae.pt b/resid_post_layer_27/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..c24aef439bba7557b49d7d8fa723305694fbf1e4 --- /dev/null +++ b/resid_post_layer_27/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40f5df62b95d97b99289364dd73f00db075dfc8aa9afac35e3d060872a285c3 +size 3758637401 diff --git a/resid_post_layer_27/trainer_2/config.json b/resid_post_layer_27/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..792c3f9f96a8914c118b02ba38622a4d9e07e19f --- /dev/null +++ b/resid_post_layer_27/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 27, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_2", + "submodule_name": "resid_post_layer_27" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_2/eval_results.json b/resid_post_layer_27/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e5114b8aeb8cc364fee62cd6ecc5a18d00a1c9 --- /dev/null +++ b/resid_post_layer_27/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 169.0, "l1_loss": 2444.72, "l0": 110.34255859375, "frac_variance_explained": 0.84341796875, "cossim": 0.91267578125, "l2_ratio": 0.9102734375, "relative_reconstruction_bias": 1.0012890625, "loss_original": 2.42591796875, "loss_reconstructed": 2.773046875, "loss_zero": 11.6109375, "frac_recovered": 0.95888671875, "frac_alive": 0.7385635375976562, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_3/ae.pt b/resid_post_layer_27/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..f34d7b1afd820bcc545663a315cd9dbbe84c3f97 --- /dev/null +++ b/resid_post_layer_27/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884bf00cc6dc0d4591233dff9972a6d529cfaf8df1a553db923a913b341356ab +size 3758637401 diff --git a/resid_post_layer_27/trainer_3/config.json b/resid_post_layer_27/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ae06461bd8dd93bd3d5657a124b1abcf47221d88 --- /dev/null +++ b/resid_post_layer_27/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 27, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_3", + "submodule_name": "resid_post_layer_27" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_27/trainer_3/eval_results.json b/resid_post_layer_27/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ede0903b5e384c7705c206f50cde7ea85d635a49 --- /dev/null +++ b/resid_post_layer_27/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 153.63, "l1_loss": 4760.48, "l0": 223.69673828125, "frac_variance_explained": 0.87056640625, "cossim": 0.92859375, "l2_ratio": 0.92630859375, "relative_reconstruction_bias": 1.00119140625, "loss_original": 2.41955078125, "loss_reconstructed": 2.65939453125, "loss_zero": 11.6228125, "frac_recovered": 0.97171875, "frac_alive": 0.7000961303710938, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_0/ae.pt b/resid_post_layer_3/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ded30abf2294df4a4c9e99921770a27a67ef395 --- /dev/null +++ b/resid_post_layer_3/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a24ab51e131c129d22b153f8d47fdb811c611ae0762c2091e0ef6175a7259a7 +size 3758637401 diff --git a/resid_post_layer_3/trainer_0/config.json b/resid_post_layer_3/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3aae22e74740e9ac8a1d4c2b6616778675fc012 --- /dev/null +++ b/resid_post_layer_3/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 3, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l03-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_3_trainer_0", + "submodule_name": "resid_post_layer_3" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_0/eval_results.json b/resid_post_layer_3/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ddb644a0287b7dd22a8a4fe0cb9cc2e8e881e4d7 --- /dev/null +++ b/resid_post_layer_3/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.6765625, "l1_loss": 72.1825, "l0": 31.1015380859375, "frac_variance_explained": 0.9167578125, "cossim": 0.967265625, "l2_ratio": 0.9670703125, "relative_reconstruction_bias": 1.0008203125, "loss_original": 2.392890625, "loss_reconstructed": 2.38859375, "loss_zero": 12.155625, "frac_recovered": 1.00123046875, "frac_alive": 0.7425689697265625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_1/ae.pt b/resid_post_layer_3/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..200faa54e6e6abe7547ea0c4f491f4b07c61407d --- /dev/null +++ b/resid_post_layer_3/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93f70d8aac4976fa9193b3c108b85fe94b9792643fe16c43d2be160a2098ed4f +size 3758637401 diff --git a/resid_post_layer_3/trainer_1/config.json b/resid_post_layer_3/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6bf544eccb7f48327ddbb8778adc9b72db918b4 --- /dev/null +++ b/resid_post_layer_3/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 3, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l03-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_3_trainer_1", + "submodule_name": "resid_post_layer_3" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_1/eval_results.json b/resid_post_layer_3/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ad31ac4061387fb06f3b1994c493b7cfb166e7 --- /dev/null +++ b/resid_post_layer_3/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 6.05296875, "l1_loss": 93.62, "l0": 63.026201171875, "frac_variance_explained": 0.93087890625, "cossim": 0.97265625, "l2_ratio": 0.97267578125, "relative_reconstruction_bias": 1.001640625, "loss_original": 2.42234375, "loss_reconstructed": 2.417109375, "loss_zero": 12.15625, "frac_recovered": 1.00064453125, "frac_alive": 0.7964630126953125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_2/ae.pt b/resid_post_layer_3/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d85564b531462bc54116f2b6308a1e20a370a70 --- /dev/null +++ b/resid_post_layer_3/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:191f27054b8a91b3352565e18730f1b7b0a23fa03c4b11c3e2c0f0fc3d08e9ce +size 3758637401 diff --git a/resid_post_layer_3/trainer_2/config.json b/resid_post_layer_3/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cbe0b0367b51834c9fc209e18a755bf2ec3501cd --- /dev/null +++ b/resid_post_layer_3/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 3, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l03-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_3_trainer_2", + "submodule_name": "resid_post_layer_3" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_2/eval_results.json b/resid_post_layer_3/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..bef50f1a180768649dee9f9a8a1d052ab5c852ba --- /dev/null +++ b/resid_post_layer_3/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.518125, "l1_loss": 138.01, "l0": 126.3843115234375, "frac_variance_explained": 0.94208984375, "cossim": 0.97658203125, "l2_ratio": 0.97673828125, "relative_reconstruction_bias": 1.00125, "loss_original": 2.42591796875, "loss_reconstructed": 2.421640625, "loss_zero": 12.135, "frac_recovered": 1.00216796875, "frac_alive": 0.8279800415039062, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_3/ae.pt b/resid_post_layer_3/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f85f7c004efdce77563bcfa507db552e1bee7e3 --- /dev/null +++ b/resid_post_layer_3/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5549fcd20823c4b801ad16cde05c1215c10eb577d81f5c3c18b029a3cbdb09a +size 3758637401 diff --git a/resid_post_layer_3/trainer_3/config.json b/resid_post_layer_3/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f27dbb08e8c8637e297fb08d67fb86f37f7fd26d --- /dev/null +++ b/resid_post_layer_3/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 3, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l03-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_3_trainer_3", + "submodule_name": "resid_post_layer_3" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_3/trainer_3/eval_results.json b/resid_post_layer_3/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cf7923838a84ac2c6d73bee5a09d925852a5697a --- /dev/null +++ b/resid_post_layer_3/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 5.07796875, "l1_loss": 264.02, "l0": 255.019814453125, "frac_variance_explained": 0.95068359375, "cossim": 0.98046875, "l2_ratio": 0.98046875, "relative_reconstruction_bias": 1.0005859375, "loss_original": 2.41955078125, "loss_reconstructed": 2.41255859375, "loss_zero": 12.196875, "frac_recovered": 1.0011328125, "frac_alive": 0.82257080078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_0/ae.pt b/resid_post_layer_7/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..05d475f4087f556248b5ff58a9c77c2b3b1f7d74 --- /dev/null +++ b/resid_post_layer_7/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c88dc39dd89bc80bcea688c91739a37d1ec55bd7bf676f9cfb4bbf847fbd73 +size 3758637401 diff --git a/resid_post_layer_7/trainer_0/config.json b/resid_post_layer_7/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..987c73d18130766706db699cb344824aac1ba160 --- /dev/null +++ b/resid_post_layer_7/trainer_0/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 32, + "device": "cuda", + "layer": 7, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l07-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_7_trainer_0", + "submodule_name": "resid_post_layer_7" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_0/eval_results.json b/resid_post_layer_7/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2d1360a4f5b7107b84dd6302b5bbc89c4e1ca4 --- /dev/null +++ b/resid_post_layer_7/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 16.011875, "l1_loss": 113.375, "l0": 29.8081884765625, "frac_variance_explained": 0.8355078125, "cossim": 0.9326953125, "l2_ratio": 0.93236328125, "relative_reconstruction_bias": 1.00130859375, "loss_original": 2.392890625, "loss_reconstructed": 2.41404296875, "loss_zero": 11.4328125, "frac_recovered": 0.99724609375, "frac_alive": 0.7072906494140625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_1/ae.pt b/resid_post_layer_7/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..32304622c063d509a0053b3292ce7fccbe0608d5 --- /dev/null +++ b/resid_post_layer_7/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94be36b5ba215103b6cad7e05edaed097d677c70ff886d170a628744d6204092 +size 3758637401 diff --git a/resid_post_layer_7/trainer_1/config.json b/resid_post_layer_7/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8923424696f4a6030814a5294ec09f21fc29c341 --- /dev/null +++ b/resid_post_layer_7/trainer_1/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 64, + "device": "cuda", + "layer": 7, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l07-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_7_trainer_1", + "submodule_name": "resid_post_layer_7" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_1/eval_results.json b/resid_post_layer_7/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7cd25782ac2f78c094d6f9dfef4c7f30da919b3f --- /dev/null +++ b/resid_post_layer_7/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 14.6103125, "l1_loss": 151.455, "l0": 59.458984375, "frac_variance_explained": 0.862109375, "cossim": 0.94443359375, "l2_ratio": 0.943671875, "relative_reconstruction_bias": 1.00130859375, "loss_original": 2.42234375, "loss_reconstructed": 2.432578125, "loss_zero": 11.4065625, "frac_recovered": 1.00880859375, "frac_alive": 0.819580078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_2/ae.pt b/resid_post_layer_7/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..998286e9349c3d1de790f7aa8879d843c14f63d1 --- /dev/null +++ b/resid_post_layer_7/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868fb28583b5ac740a44ee947b66b8bf29146003c4fdfd6a7d30250c8fb6e293 +size 3758637401 diff --git a/resid_post_layer_7/trainer_2/config.json b/resid_post_layer_7/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f3956ca4927549c3629ee1f23efba974b961eb83 --- /dev/null +++ b/resid_post_layer_7/trainer_2/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 128, + "device": "cuda", + "layer": 7, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l07-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_7_trainer_2", + "submodule_name": "resid_post_layer_7" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_2/eval_results.json b/resid_post_layer_7/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..203675c73e1a15a3b49e22557f37b6aebda91053 --- /dev/null +++ b/resid_post_layer_7/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 13.3225, "l1_loss": 209.19, "l0": 118.6482177734375, "frac_variance_explained": 0.8851953125, "cossim": 0.953359375, "l2_ratio": 0.95322265625, "relative_reconstruction_bias": 1.00138671875, "loss_original": 2.42591796875, "loss_reconstructed": 2.4321484375, "loss_zero": 11.4053125, "frac_recovered": 1.0053369140625, "frac_alive": 0.898406982421875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_3/ae.pt b/resid_post_layer_7/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc9eedd50f2b7de3f9c476f820c8c7f239e24e5c --- /dev/null +++ b/resid_post_layer_7/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4fafa2b5f47f771219ae100bb6383fa265cca8115eccf76acc947ce79a86ee +size 3758637401 diff --git a/resid_post_layer_7/trainer_3/config.json b/resid_post_layer_7/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c85645b8707c5015280e818c856846120c96cb4 --- /dev/null +++ b/resid_post_layer_7/trainer_3/config.json @@ -0,0 +1,33 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0001, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1792, + "seed": 0, + "activation_dim": 3584, + "dict_size": 131072, + "k": 256, + "device": "cuda", + "layer": 7, + "lm_name": "Qwen/Qwen2.5-7B-Instruct", + "wandb_name": "5_l07-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_7_trainer_3", + "submodule_name": "resid_post_layer_7" + }, + "buffer": { + "d_submodule": 3584, + "io": "out", + "n_ctxs": 244, + "ctx_len": 1024, + "refresh_batch_size": 16, + "out_batch_size": 2048, + "device": "cuda", + "internal_device": "cuda" + } +} \ No newline at end of file diff --git a/resid_post_layer_7/trainer_3/eval_results.json b/resid_post_layer_7/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..04880dc8ad9d684d41f33e4c4593898aeeabfa2d --- /dev/null +++ b/resid_post_layer_7/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 12.11375, "l1_loss": 344.59, "l0": 232.4397509765625, "frac_variance_explained": 0.90552734375, "cossim": 0.96130859375, "l2_ratio": 0.9610546875, "relative_reconstruction_bias": 1.000859375, "loss_original": 2.41955078125, "loss_reconstructed": 2.41904296875, "loss_zero": 11.411875, "frac_recovered": 0.99695068359375, "frac_alive": 0.9295196533203125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}} \ No newline at end of file