andyrdt committed on
Commit
37a9f02
·
verified ·
1 Parent(s): 4421b72

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. resid_post_layer_11/trainer_0/ae.pt +3 -0
  2. resid_post_layer_11/trainer_0/config.json +33 -0
  3. resid_post_layer_11/trainer_0/eval_results.json +1 -0
  4. resid_post_layer_11/trainer_1/ae.pt +3 -0
  5. resid_post_layer_11/trainer_1/config.json +33 -0
  6. resid_post_layer_11/trainer_1/eval_results.json +1 -0
  7. resid_post_layer_11/trainer_2/ae.pt +3 -0
  8. resid_post_layer_11/trainer_2/config.json +33 -0
  9. resid_post_layer_11/trainer_2/eval_results.json +1 -0
  10. resid_post_layer_11/trainer_3/ae.pt +3 -0
  11. resid_post_layer_11/trainer_3/config.json +33 -0
  12. resid_post_layer_11/trainer_3/eval_results.json +1 -0
  13. resid_post_layer_15/trainer_0/ae.pt +3 -0
  14. resid_post_layer_15/trainer_0/config.json +33 -0
  15. resid_post_layer_15/trainer_0/eval_results.json +1 -0
  16. resid_post_layer_15/trainer_1/ae.pt +3 -0
  17. resid_post_layer_15/trainer_1/config.json +33 -0
  18. resid_post_layer_15/trainer_1/eval_results.json +1 -0
  19. resid_post_layer_15/trainer_2/ae.pt +3 -0
  20. resid_post_layer_15/trainer_2/config.json +33 -0
  21. resid_post_layer_15/trainer_2/eval_results.json +1 -0
  22. resid_post_layer_15/trainer_3/ae.pt +3 -0
  23. resid_post_layer_15/trainer_3/config.json +33 -0
  24. resid_post_layer_15/trainer_3/eval_results.json +1 -0
  25. resid_post_layer_19/trainer_0/ae.pt +3 -0
  26. resid_post_layer_19/trainer_0/config.json +33 -0
  27. resid_post_layer_19/trainer_0/eval_results.json +1 -0
  28. resid_post_layer_19/trainer_1/ae.pt +3 -0
  29. resid_post_layer_19/trainer_1/config.json +33 -0
  30. resid_post_layer_19/trainer_1/eval_results.json +1 -0
  31. resid_post_layer_19/trainer_2/ae.pt +3 -0
  32. resid_post_layer_19/trainer_2/config.json +33 -0
  33. resid_post_layer_19/trainer_2/eval_results.json +1 -0
  34. resid_post_layer_19/trainer_3/ae.pt +3 -0
  35. resid_post_layer_19/trainer_3/config.json +33 -0
  36. resid_post_layer_19/trainer_3/eval_results.json +1 -0
  37. resid_post_layer_23/trainer_0/ae.pt +3 -0
  38. resid_post_layer_23/trainer_0/config.json +33 -0
  39. resid_post_layer_23/trainer_0/eval_results.json +1 -0
  40. resid_post_layer_23/trainer_1/ae.pt +3 -0
  41. resid_post_layer_23/trainer_1/config.json +33 -0
  42. resid_post_layer_23/trainer_1/eval_results.json +1 -0
  43. resid_post_layer_23/trainer_2/ae.pt +3 -0
  44. resid_post_layer_23/trainer_2/config.json +33 -0
  45. resid_post_layer_23/trainer_2/eval_results.json +1 -0
  46. resid_post_layer_23/trainer_3/ae.pt +3 -0
  47. resid_post_layer_23/trainer_3/config.json +33 -0
  48. resid_post_layer_23/trainer_3/eval_results.json +1 -0
  49. resid_post_layer_27/trainer_0/ae.pt +3 -0
  50. resid_post_layer_27/trainer_0/config.json +33 -0
resid_post_layer_11/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a577b5c6768545b6452bfea81085034cbdf1cabe79ddd9d1d61dbb74606306f
3
+ size 3758637401
resid_post_layer_11/trainer_0/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 32,
17
+ "device": "cuda",
18
+ "layer": 11,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_0",
21
+ "submodule_name": "resid_post_layer_11"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_11/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 21.99125, "l1_loss": 135.775, "l0": 29.6123583984375, "frac_variance_explained": 0.793046875, "cossim": 0.91607421875, "l2_ratio": 0.916484375, "relative_reconstruction_bias": 1.0026171875, "loss_original": 2.392890625, "loss_reconstructed": 2.449375, "loss_zero": 11.820625, "frac_recovered": 0.9944140625, "frac_alive": 0.675262451171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_11/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36bddbd229d59c11ed61f77a95a89da3ed141214e891677bb5d7facf64b57100
3
+ size 3758637401
resid_post_layer_11/trainer_1/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 64,
17
+ "device": "cuda",
18
+ "layer": 11,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_1",
21
+ "submodule_name": "resid_post_layer_11"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_11/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 20.0925, "l1_loss": 184.49, "l0": 58.8667578125, "frac_variance_explained": 0.82669921875, "cossim": 0.93015625, "l2_ratio": 0.93037109375, "relative_reconstruction_bias": 1.0019140625, "loss_original": 2.42234375, "loss_reconstructed": 2.4514453125, "loss_zero": 11.8234375, "frac_recovered": 0.998515625, "frac_alive": 0.763702392578125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_11/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d0fdd0fbf3bd21fbe0b3cdd72516830b243ee54799d5e543781f7124f9f731d
3
+ size 3758637401
resid_post_layer_11/trainer_2/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 128,
17
+ "device": "cuda",
18
+ "layer": 11,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_2",
21
+ "submodule_name": "resid_post_layer_11"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_11/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 18.288125, "l1_loss": 256.115, "l0": 115.6953466796875, "frac_variance_explained": 0.8566796875, "cossim": 0.94232421875, "l2_ratio": 0.94181640625, "relative_reconstruction_bias": 1.000546875, "loss_original": 2.42591796875, "loss_reconstructed": 2.4419921875, "loss_zero": 11.7809375, "frac_recovered": 0.99998046875, "frac_alive": 0.8217086791992188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_11/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8634b6d64435a89ee7a8ebf16d739c2b0a882eeb1e5219452c79243fea96d1de
3
+ size 3758637401
resid_post_layer_11/trainer_3/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 256,
17
+ "device": "cuda",
18
+ "layer": 11,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l11-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_11_trainer_3",
21
+ "submodule_name": "resid_post_layer_11"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_11/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 16.47625, "l1_loss": 395.87, "l0": 232.93888671875, "frac_variance_explained": 0.88396484375, "cossim": 0.95333984375, "l2_ratio": 0.95359375, "relative_reconstruction_bias": 1.001640625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43615234375, "loss_zero": 11.829375, "frac_recovered": 0.999140625, "frac_alive": 0.837890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_15/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1339c52258e95bc64535a90e45067187722e659e0ee03c976db813614b29ab5b
3
+ size 3758637401
resid_post_layer_15/trainer_0/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 32,
17
+ "device": "cuda",
18
+ "layer": 15,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_0",
21
+ "submodule_name": "resid_post_layer_15"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_15/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 28.453125, "l1_loss": 166.495, "l0": 29.5351513671875, "frac_variance_explained": 0.76798828125, "cossim": 0.90923828125, "l2_ratio": 0.90875, "relative_reconstruction_bias": 1.00078125, "loss_original": 2.392890625, "loss_reconstructed": 2.485078125, "loss_zero": 16.193125, "frac_recovered": 0.99515625, "frac_alive": 0.6423416137695312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_15/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2efefadd8d85ad1a2bfcf3cb5eceb2976590dc88b4873e56b864c42c387f5cc5
3
+ size 3758637401
resid_post_layer_15/trainer_1/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 64,
17
+ "device": "cuda",
18
+ "layer": 15,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_1",
21
+ "submodule_name": "resid_post_layer_15"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_15/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 25.911875, "l1_loss": 228.895, "l0": 59.186298828125, "frac_variance_explained": 0.8069140625, "cossim": 0.9255859375, "l2_ratio": 0.92529296875, "relative_reconstruction_bias": 1.0009375, "loss_original": 2.42234375, "loss_reconstructed": 2.47240234375, "loss_zero": 16.1503125, "frac_recovered": 1.0130078125, "frac_alive": 0.7274703979492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_15/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:349e6a5713950790ed10bde7b055c214b1c5732c6c6317c673de4b8dbf35fcab
3
+ size 3758637401
resid_post_layer_15/trainer_2/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 128,
17
+ "device": "cuda",
18
+ "layer": 15,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_2",
21
+ "submodule_name": "resid_post_layer_15"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_15/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 23.581875, "l1_loss": 318.67, "l0": 116.2112255859375, "frac_variance_explained": 0.840546875, "cossim": 0.9380078125, "l2_ratio": 0.937578125, "relative_reconstruction_bias": 0.999765625, "loss_original": 2.42591796875, "loss_reconstructed": 2.462734375, "loss_zero": 16.1625, "frac_recovered": 1.011953125, "frac_alive": 0.7831344604492188, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_15/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bab21a67e23230984c764284b9496fc8f016a5d1fb5b58180672b405878ad02
3
+ size 3758637401
resid_post_layer_15/trainer_3/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 256,
17
+ "device": "cuda",
18
+ "layer": 15,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l15-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_15_trainer_3",
21
+ "submodule_name": "resid_post_layer_15"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_15/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 21.26125, "l1_loss": 613.3, "l0": 232.84298828125, "frac_variance_explained": 0.8707421875, "cossim": 0.94994140625, "l2_ratio": 0.9498046875, "relative_reconstruction_bias": 1.0001953125, "loss_original": 2.41955078125, "loss_reconstructed": 2.4448046875, "loss_zero": 16.1625, "frac_recovered": 1.00779296875, "frac_alive": 0.7854995727539062, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_19/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056bdc0846c5754993987552820c2ec59611b4390ae3a073f1406b2e1030b6d5
3
+ size 3758637401
resid_post_layer_19/trainer_0/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 32,
17
+ "device": "cuda",
18
+ "layer": 19,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_0",
21
+ "submodule_name": "resid_post_layer_19"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_19/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 39.79, "l1_loss": 240.27, "l0": 29.8869873046875, "frac_variance_explained": 0.7619921875, "cossim": 0.90751953125, "l2_ratio": 0.90701171875, "relative_reconstruction_bias": 1.00177734375, "loss_original": 2.392890625, "loss_reconstructed": 2.475546875, "loss_zero": 10.954375, "frac_recovered": 0.99154296875, "frac_alive": 0.606231689453125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_19/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a90d1309919de4a7712b6d86a7d715c525c686ffe45c1255e8a689b752a8798a
3
+ size 3758637401
resid_post_layer_19/trainer_1/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 64,
17
+ "device": "cuda",
18
+ "layer": 19,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_1",
21
+ "submodule_name": "resid_post_layer_19"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_19/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 35.86875, "l1_loss": 327.34, "l0": 59.9685205078125, "frac_variance_explained": 0.80572265625, "cossim": 0.9255078125, "l2_ratio": 0.92548828125, "relative_reconstruction_bias": 1.00125, "loss_original": 2.42234375, "loss_reconstructed": 2.4575, "loss_zero": 10.970625, "frac_recovered": 0.99763671875, "frac_alive": 0.6950225830078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_19/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e0dd6904a6379470b55fe31225cb5d8abf7cf5359f778881b87b08a54e383e
3
+ size 3758637401
resid_post_layer_19/trainer_2/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 128,
17
+ "device": "cuda",
18
+ "layer": 19,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_2",
21
+ "submodule_name": "resid_post_layer_19"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_19/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 32.340625, "l1_loss": 481.66, "l0": 120.2642041015625, "frac_variance_explained": 0.84236328125, "cossim": 0.93974609375, "l2_ratio": 0.9403125, "relative_reconstruction_bias": 1.001796875, "loss_original": 2.42591796875, "loss_reconstructed": 2.44478515625, "loss_zero": 10.95625, "frac_recovered": 0.99982421875, "frac_alive": 0.7388534545898438, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_19/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a651eeda01072b4407e4b955ee28c04b2345419f301e8c518b2bd73f9655d271
3
+ size 3758637401
resid_post_layer_19/trainer_3/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 256,
17
+ "device": "cuda",
18
+ "layer": 19,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l19-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_19_trainer_3",
21
+ "submodule_name": "resid_post_layer_19"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_19/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 28.92875, "l1_loss": 920.9, "l0": 240.0174609375, "frac_variance_explained": 0.87439453125, "cossim": 0.95271484375, "l2_ratio": 0.9522265625, "relative_reconstruction_bias": 1.000390625, "loss_original": 2.41955078125, "loss_reconstructed": 2.43544921875, "loss_zero": 10.966875, "frac_recovered": 1.00068359375, "frac_alive": 0.7600173950195312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_23/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a593d0da4a10cde4674b48749bf573b14e40119c1cff34cf2d499f334940ef4b
3
+ size 3758637401
resid_post_layer_23/trainer_0/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 32,
17
+ "device": "cuda",
18
+ "layer": 23,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_0",
21
+ "submodule_name": "resid_post_layer_23"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_23/trainer_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 84.4425, "l1_loss": 532.56, "l0": 29.5232763671875, "frac_variance_explained": 0.77451171875, "cossim": 0.91509765625, "l2_ratio": 0.91498046875, "relative_reconstruction_bias": 1.001328125, "loss_original": 2.392890625, "loss_reconstructed": 2.52521484375, "loss_zero": 12.08125, "frac_recovered": 0.98640625, "frac_alive": 0.613861083984375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_23/trainer_1/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd895c95a13cc1c95946014b988ed0a7e287b0f51ca04175775740a5215705b2
3
+ size 3758637401
resid_post_layer_23/trainer_1/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 64,
17
+ "device": "cuda",
18
+ "layer": 23,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_1",
21
+ "submodule_name": "resid_post_layer_23"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_23/trainer_1/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 75.2275, "l1_loss": 717.88, "l0": 59.1184375, "frac_variance_explained": 0.82005859375, "cossim": 0.933203125, "l2_ratio": 0.93279296875, "relative_reconstruction_bias": 1.0010546875, "loss_original": 2.42234375, "loss_reconstructed": 2.501484375, "loss_zero": 12.0415625, "frac_recovered": 1.00083984375, "frac_alive": 0.6967620849609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_23/trainer_2/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e4e22f7f5db4cb313f002df7d985f4fac93e6fae37890cc43ace57af2c318d4
3
+ size 3758637401
resid_post_layer_23/trainer_2/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 128,
17
+ "device": "cuda",
18
+ "layer": 23,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_2",
21
+ "submodule_name": "resid_post_layer_23"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_23/trainer_2/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 66.8325, "l1_loss": 1001.98, "l0": 119.3653466796875, "frac_variance_explained": 0.85810546875, "cossim": 0.9473828125, "l2_ratio": 0.94765625, "relative_reconstruction_bias": 1.0016015625, "loss_original": 2.42591796875, "loss_reconstructed": 2.468515625, "loss_zero": 12.05875, "frac_recovered": 1.00724609375, "frac_alive": 0.7726287841796875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_23/trainer_3/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d035398575abe0be9544ed3b9b415402ca0e4c0fd284972d3bc242b6fe5c088
3
+ size 3758637401
resid_post_layer_23/trainer_3/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 256,
17
+ "device": "cuda",
18
+ "layer": 23,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l23-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_23_trainer_3",
21
+ "submodule_name": "resid_post_layer_23"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }
resid_post_layer_23/trainer_3/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 59.355, "l1_loss": 1774.04, "l0": 241.8087744140625, "frac_variance_explained": 0.88796875, "cossim": 0.95853515625, "l2_ratio": 0.95830078125, "relative_reconstruction_bias": 1.00103515625, "loss_original": 2.41955078125, "loss_reconstructed": 2.4435546875, "loss_zero": 12.0590625, "frac_recovered": 1.00390625, "frac_alive": 0.8097000122070312, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
resid_post_layer_27/trainer_0/ae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd70caafcecaac370b7116f39fff88a5ae2fdc53c8039ca69d9deaf06bc7bef
3
+ size 3758637401
resid_post_layer_27/trainer_0/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "trainer_class": "BatchTopKTrainer",
4
+ "dict_class": "BatchTopKSAE",
5
+ "lr": 0.0001,
6
+ "steps": 244140,
7
+ "auxk_alpha": 0.03125,
8
+ "warmup_steps": 1000,
9
+ "decay_start": 195312,
10
+ "threshold_beta": 0.999,
11
+ "threshold_start_step": 1000,
12
+ "top_k_aux": 1792,
13
+ "seed": 0,
14
+ "activation_dim": 3584,
15
+ "dict_size": 131072,
16
+ "k": 32,
17
+ "device": "cuda",
18
+ "layer": 27,
19
+ "lm_name": "Qwen/Qwen2.5-7B-Instruct",
20
+ "wandb_name": "5_l27-BatchTopKTrainer-Qwen/Qwen2.5-7B-Instruct-resid_post_layer_27_trainer_0",
21
+ "submodule_name": "resid_post_layer_27"
22
+ },
23
+ "buffer": {
24
+ "d_submodule": 3584,
25
+ "io": "out",
26
+ "n_ctxs": 244,
27
+ "ctx_len": 1024,
28
+ "refresh_batch_size": 16,
29
+ "out_batch_size": 2048,
30
+ "device": "cuda",
31
+ "internal_device": "cuda"
32
+ }
33
+ }