andyrdt
/

saes-qwen2.5-7b-instruct

Model card Files and versions Community

andyrdt committed on May 26

Commit

37a9f02

verified ·

1 Parent(s): 4421b72

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

resid_post_layer_11/trainer_0/ae.pt +3 -0
resid_post_layer_11/trainer_0/config.json +33 -0
resid_post_layer_11/trainer_0/eval_results.json +1 -0
resid_post_layer_11/trainer_1/ae.pt +3 -0
resid_post_layer_11/trainer_1/config.json +33 -0
resid_post_layer_11/trainer_1/eval_results.json +1 -0
resid_post_layer_11/trainer_2/ae.pt +3 -0
resid_post_layer_11/trainer_2/config.json +33 -0
resid_post_layer_11/trainer_2/eval_results.json +1 -0
resid_post_layer_11/trainer_3/ae.pt +3 -0
resid_post_layer_11/trainer_3/config.json +33 -0
resid_post_layer_11/trainer_3/eval_results.json +1 -0
resid_post_layer_15/trainer_0/ae.pt +3 -0
resid_post_layer_15/trainer_0/config.json +33 -0
resid_post_layer_15/trainer_0/eval_results.json +1 -0
resid_post_layer_15/trainer_1/ae.pt +3 -0
resid_post_layer_15/trainer_1/config.json +33 -0
resid_post_layer_15/trainer_1/eval_results.json +1 -0
resid_post_layer_15/trainer_2/ae.pt +3 -0
resid_post_layer_15/trainer_2/config.json +33 -0
resid_post_layer_15/trainer_2/eval_results.json +1 -0
resid_post_layer_15/trainer_3/ae.pt +3 -0
resid_post_layer_15/trainer_3/config.json +33 -0
resid_post_layer_15/trainer_3/eval_results.json +1 -0
resid_post_layer_19/trainer_0/ae.pt +3 -0
resid_post_layer_19/trainer_0/config.json +33 -0
resid_post_layer_19/trainer_0/eval_results.json +1 -0
resid_post_layer_19/trainer_1/ae.pt +3 -0
resid_post_layer_19/trainer_1/config.json +33 -0
resid_post_layer_19/trainer_1/eval_results.json +1 -0
resid_post_layer_19/trainer_2/ae.pt +3 -0
resid_post_layer_19/trainer_2/config.json +33 -0
resid_post_layer_19/trainer_2/eval_results.json +1 -0
resid_post_layer_19/trainer_3/ae.pt +3 -0
resid_post_layer_19/trainer_3/config.json +33 -0
resid_post_layer_19/trainer_3/eval_results.json +1 -0
resid_post_layer_23/trainer_0/ae.pt +3 -0
resid_post_layer_23/trainer_0/config.json +33 -0
resid_post_layer_23/trainer_0/eval_results.json +1 -0
resid_post_layer_23/trainer_1/ae.pt +3 -0
resid_post_layer_23/trainer_1/config.json +33 -0
resid_post_layer_23/trainer_1/eval_results.json +1 -0
resid_post_layer_23/trainer_2/ae.pt +3 -0
resid_post_layer_23/trainer_2/config.json +33 -0
resid_post_layer_23/trainer_2/eval_results.json +1 -0
resid_post_layer_23/trainer_3/ae.pt +3 -0
resid_post_layer_23/trainer_3/config.json +33 -0
resid_post_layer_23/trainer_3/eval_results.json +1 -0
resid_post_layer_27/trainer_0/ae.pt +3 -0
resid_post_layer_27/trainer_0/config.json +33 -0

resid_post_layer_11/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a577b5c6768545b6452bfea81085034cbdf1cabe79ddd9d1d61dbb74606306f
+size 3758637401

resid_post_layer_11/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 32,
+        "device": "cuda",
+        "layer": 11,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_0",
+        "submodule_name": "resid_post_layer_11"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_11/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 21.99125, "l1_loss": 135.775, "l0": 29.6123583984375, "frac_variance_explained": 0.793046875, "cossim": 0.91607421875, "l2_ratio": 0.916484375, "relative_reconstruction_bias": 1.0026171875, "loss_original": 2.392890625, "loss_reconstructed": 2.449375, "loss_zero": 11.820625, "frac_recovered": 0.9944140625, "frac_alive": 0.675262451171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_11/trainer_1/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36bddbd229d59c11ed61f77a95a89da3ed141214e891677bb5d7facf64b57100
+size 3758637401

resid_post_layer_11/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 64,
+        "device": "cuda",
+        "layer": 11,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_1",
+        "submodule_name": "resid_post_layer_11"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_11/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 20.0925, "l1_loss": 184.49, "l0": 58.8667578125, "frac_variance_explained": 0.82669921875, "cossim": 0.93015625, "l2_ratio": 0.93037109375, "relative_reconstruction_bias": 1.0019140625, "loss_original": 2.42234375, "loss_reconstructed": 2.4514453125, "loss_zero": 11.8234375, "frac_recovered": 0.998515625, "frac_alive": 0.763702392578125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_11/trainer_2/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d0fdd0fbf3bd21fbe0b3cdd72516830b243ee54799d5e543781f7124f9f731d
+size 3758637401

resid_post_layer_11/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 128,
+        "device": "cuda",
+        "layer": 11,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_2",
+        "submodule_name": "resid_post_layer_11"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_11/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 18.288125, "l1_loss": 256.115, "l0": 115.6953466796875, "frac_variance_explained": 0.8566796875, "cossim": 0.94232421875, "l2_ratio": 0.94181640625, "relative_reconstruction_bias": 1.000546875, "loss_original": 2.42591796875, "loss_reconstructed": 2.4419921875, "loss_zero": 11.7809375, "frac_recovered": 0.99998046875, "frac_alive": 0.8217086791992188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_11/trainer_3/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8634b6d64435a89ee7a8ebf16d739c2b0a882eeb1e5219452c79243fea96d1de
+size 3758637401

resid_post_layer_11/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 256,
+        "device": "cuda",
+        "layer": 11,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_3",
+        "submodule_name": "resid_post_layer_11"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_11/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 16.47625, "l1_loss": 395.87, "l0": 232.93888671875, "frac_variance_explained": 0.88396484375, "cossim": 0.95333984375, "l2_ratio": 0.95359375, "relative_reconstruction_bias": 1.001640625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43615234375, "loss_zero": 11.829375, "frac_recovered": 0.999140625, "frac_alive": 0.837890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_15/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1339c52258e95bc64535a90e45067187722e659e0ee03c976db813614b29ab5b
+size 3758637401

resid_post_layer_15/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 32,
+        "device": "cuda",
+        "layer": 15,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_0",
+        "submodule_name": "resid_post_layer_15"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_15/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 28.453125, "l1_loss": 166.495, "l0": 29.5351513671875, "frac_variance_explained": 0.76798828125, "cossim": 0.90923828125, "l2_ratio": 0.90875, "relative_reconstruction_bias": 1.00078125, "loss_original": 2.392890625, "loss_reconstructed": 2.485078125, "loss_zero": 16.193125, "frac_recovered": 0.99515625, "frac_alive": 0.6423416137695312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_15/trainer_1/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2efefadd8d85ad1a2bfcf3cb5eceb2976590dc88b4873e56b864c42c387f5cc5
+size 3758637401

resid_post_layer_15/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 64,
+        "device": "cuda",
+        "layer": 15,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_1",
+        "submodule_name": "resid_post_layer_15"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_15/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 25.911875, "l1_loss": 228.895, "l0": 59.186298828125, "frac_variance_explained": 0.8069140625, "cossim": 0.9255859375, "l2_ratio": 0.92529296875, "relative_reconstruction_bias": 1.0009375, "loss_original": 2.42234375, "loss_reconstructed": 2.47240234375, "loss_zero": 16.1503125, "frac_recovered": 1.0130078125, "frac_alive": 0.7274703979492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_15/trainer_2/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:349e6a5713950790ed10bde7b055c214b1c5732c6c6317c673de4b8dbf35fcab
+size 3758637401

resid_post_layer_15/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 128,
+        "device": "cuda",
+        "layer": 15,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_2",
+        "submodule_name": "resid_post_layer_15"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_15/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 23.581875, "l1_loss": 318.67, "l0": 116.2112255859375, "frac_variance_explained": 0.840546875, "cossim": 0.9380078125, "l2_ratio": 0.937578125, "relative_reconstruction_bias": 0.999765625, "loss_original": 2.42591796875, "loss_reconstructed": 2.462734375, "loss_zero": 16.1625, "frac_recovered": 1.011953125, "frac_alive": 0.7831344604492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_15/trainer_3/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bab21a67e23230984c764284b9496fc8f016a5d1fb5b58180672b405878ad02
+size 3758637401

resid_post_layer_15/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 256,
+        "device": "cuda",
+        "layer": 15,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_3",
+        "submodule_name": "resid_post_layer_15"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_15/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 21.26125, "l1_loss": 613.3, "l0": 232.84298828125, "frac_variance_explained": 0.8707421875, "cossim": 0.94994140625, "l2_ratio": 0.9498046875, "relative_reconstruction_bias": 1.0001953125, "loss_original": 2.41955078125, "loss_reconstructed": 2.4448046875, "loss_zero": 16.1625, "frac_recovered": 1.00779296875, "frac_alive": 0.7854995727539062, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_19/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:056bdc0846c5754993987552820c2ec59611b4390ae3a073f1406b2e1030b6d5
+size 3758637401

resid_post_layer_19/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 32,
+        "device": "cuda",
+        "layer": 19,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_0",
+        "submodule_name": "resid_post_layer_19"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_19/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 39.79, "l1_loss": 240.27, "l0": 29.8869873046875, "frac_variance_explained": 0.7619921875, "cossim": 0.90751953125, "l2_ratio": 0.90701171875, "relative_reconstruction_bias": 1.00177734375, "loss_original": 2.392890625, "loss_reconstructed": 2.475546875, "loss_zero": 10.954375, "frac_recovered": 0.99154296875, "frac_alive": 0.606231689453125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_19/trainer_1/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a90d1309919de4a7712b6d86a7d715c525c686ffe45c1255e8a689b752a8798a
+size 3758637401

resid_post_layer_19/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 64,
+        "device": "cuda",
+        "layer": 19,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_1",
+        "submodule_name": "resid_post_layer_19"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_19/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 35.86875, "l1_loss": 327.34, "l0": 59.9685205078125, "frac_variance_explained": 0.80572265625, "cossim": 0.9255078125, "l2_ratio": 0.92548828125, "relative_reconstruction_bias": 1.00125, "loss_original": 2.42234375, "loss_reconstructed": 2.4575, "loss_zero": 10.970625, "frac_recovered": 0.99763671875, "frac_alive": 0.6950225830078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_19/trainer_2/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71e0dd6904a6379470b55fe31225cb5d8abf7cf5359f778881b87b08a54e383e
+size 3758637401

resid_post_layer_19/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 128,
+        "device": "cuda",
+        "layer": 19,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_2",
+        "submodule_name": "resid_post_layer_19"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_19/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 32.340625, "l1_loss": 481.66, "l0": 120.2642041015625, "frac_variance_explained": 0.84236328125, "cossim": 0.93974609375, "l2_ratio": 0.9403125, "relative_reconstruction_bias": 1.001796875, "loss_original": 2.42591796875, "loss_reconstructed": 2.44478515625, "loss_zero": 10.95625, "frac_recovered": 0.99982421875, "frac_alive": 0.7388534545898438, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_19/trainer_3/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a651eeda01072b4407e4b955ee28c04b2345419f301e8c518b2bd73f9655d271
+size 3758637401

resid_post_layer_19/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 256,
+        "device": "cuda",
+        "layer": 19,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_3",
+        "submodule_name": "resid_post_layer_19"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_19/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 28.92875, "l1_loss": 920.9, "l0": 240.0174609375, "frac_variance_explained": 0.87439453125, "cossim": 0.95271484375, "l2_ratio": 0.9522265625, "relative_reconstruction_bias": 1.000390625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43544921875, "loss_zero": 10.966875, "frac_recovered": 1.00068359375, "frac_alive": 0.7600173950195312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_23/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a593d0da4a10cde4674b48749bf573b14e40119c1cff34cf2d499f334940ef4b
+size 3758637401

resid_post_layer_23/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 32,
+        "device": "cuda",
+        "layer": 23,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_0",
+        "submodule_name": "resid_post_layer_23"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_23/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 84.4425, "l1_loss": 532.56, "l0": 29.5232763671875, "frac_variance_explained": 0.77451171875, "cossim": 0.91509765625, "l2_ratio": 0.91498046875, "relative_reconstruction_bias": 1.001328125, "loss_original": 2.392890625, "loss_reconstructed": 2.52521484375, "loss_zero": 12.08125, "frac_recovered": 0.98640625, "frac_alive": 0.613861083984375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_23/trainer_1/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd895c95a13cc1c95946014b988ed0a7e287b0f51ca04175775740a5215705b2
+size 3758637401

resid_post_layer_23/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 64,
+        "device": "cuda",
+        "layer": 23,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_1",
+        "submodule_name": "resid_post_layer_23"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_23/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 75.2275, "l1_loss": 717.88, "l0": 59.1184375, "frac_variance_explained": 0.82005859375, "cossim": 0.933203125, "l2_ratio": 0.93279296875, "relative_reconstruction_bias": 1.0010546875, "loss_original": 2.42234375, "loss_reconstructed": 2.501484375, "loss_zero": 12.0415625, "frac_recovered": 1.00083984375, "frac_alive": 0.6967620849609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_23/trainer_2/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e4e22f7f5db4cb313f002df7d985f4fac93e6fae37890cc43ace57af2c318d4
+size 3758637401

resid_post_layer_23/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 128,
+        "device": "cuda",
+        "layer": 23,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_2",
+        "submodule_name": "resid_post_layer_23"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_23/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 66.8325, "l1_loss": 1001.98, "l0": 119.3653466796875, "frac_variance_explained": 0.85810546875, "cossim": 0.9473828125, "l2_ratio": 0.94765625, "relative_reconstruction_bias": 1.0016015625, "loss_original": 2.42591796875, "loss_reconstructed": 2.468515625, "loss_zero": 12.05875, "frac_recovered": 1.00724609375, "frac_alive": 0.7726287841796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_23/trainer_3/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d035398575abe0be9544ed3b9b415402ca0e4c0fd284972d3bc242b6fe5c088
+size 3758637401

resid_post_layer_23/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 256,
+        "device": "cuda",
+        "layer": 23,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_3",
+        "submodule_name": "resid_post_layer_23"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}

resid_post_layer_23/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"l2_loss": 59.355, "l1_loss": 1774.04, "l0": 241.8087744140625, "frac_variance_explained": 0.88796875, "cossim": 0.95853515625, "l2_ratio": 0.95830078125, "relative_reconstruction_bias": 1.00103515625, "loss_original": 2.41955078125, "loss_reconstructed": 2.4435546875, "loss_zero": 12.0590625, "frac_recovered": 1.00390625, "frac_alive": 0.8097000122070312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}

resid_post_layer_27/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd70caafcecaac370b7116f39fff88a5ae2fdc53c8039ca69d9deaf06bc7bef
+size 3758637401

resid_post_layer_27/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0001,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1792,
+        "seed": 0,
+        "activation_dim": 3584,
+        "dict_size": 131072,
+        "k": 32,
+        "device": "cuda",
+        "layer": 27,
+        "lm_name": "Qwen/Qwen2.5-7B-Instruct",
+        "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_0",
+        "submodule_name": "resid_post_layer_27"
+    },
+    "buffer": {
+        "d_submodule": 3584,
+        "io": "out",
+        "n_ctxs": 244,
+        "ctx_len": 1024,
+        "refresh_batch_size": 16,
+        "out_batch_size": 2048,
+        "device": "cuda",
+        "internal_device": "cuda"
+    }
+}