Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/eval_results.json +1 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/eval_results.json +1 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/eval_results.json +1 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/eval_results.json +1 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/eval_results.json +1 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/ae.pt +3 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/config.json +32 -0
- gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/eval_results.json +1 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_0/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_0/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_244/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_244/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_2441/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_2441/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_24414/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_24414/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_772/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_772/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_7720/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_7720/config.json +31 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_77203/ae.pt +3 -0
- gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_77203/config.json +31 -0
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc8f9d716c77c9ae84b25674b076b969f4e52947c94b3a450f4214bfe26f37be
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 320,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_10",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_10/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 42.79875, "l1_loss": 1562.96, "l0": 321.2723876953125, "frac_variance_explained": 0.90958984375, "cossim": 0.953125, "l2_ratio": 0.95298828125, "relative_reconstruction_bias": 1.00021484375, "loss_original": 2.152294921875, "loss_reconstructed": 2.18591796875, "loss_zero": 12.4375, "frac_recovered": 0.996171875, "frac_alive": 0.96368408203125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe22ddf8d9511da263f3106bc3fbaf500b6dc0701f6c1085d5f7b47ac6415447
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 640,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_11",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_11/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 34.8, "l1_loss": 2747.28, "l0": 642.0887109375, "frac_variance_explained": 0.94068359375, "cossim": 0.968359375, "l2_ratio": 0.9684765625, "relative_reconstruction_bias": 0.999453125, "loss_original": 2.152294921875, "loss_reconstructed": 2.16748046875, "loss_zero": 12.4375, "frac_recovered": 0.99841796875, "frac_alive": 0.83428955078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:01638d64af64ef161f923e2ca4d20742581452606d7cb0f491d66c146ed1962a
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 20,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_6",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_6/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 64.55875, "l1_loss": 286.01, "l0": 19.994365234375, "frac_variance_explained": 0.790859375, "cossim": 0.89044921875, "l2_ratio": 0.890546875, "relative_reconstruction_bias": 1.00109375, "loss_original": 2.152294921875, "loss_reconstructed": 2.385322265625, "loss_zero": 12.4375, "frac_recovered": 0.97697265625, "frac_alive": 0.97857666015625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5477b613d15f981e2767354521bafe5cbc85e33189b6064b0a327d0ff40f04bd
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 40,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_7",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_7/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 59.685, "l1_loss": 387.52, "l0": 39.811513671875, "frac_variance_explained": 0.82166015625, "cossim": 0.90671875, "l2_ratio": 0.90642578125, "relative_reconstruction_bias": 1.000546875, "loss_original": 2.152294921875, "loss_reconstructed": 2.2889453125, "loss_zero": 12.4375, "frac_recovered": 0.98634765625, "frac_alive": 0.98846435546875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7dab4effd204e72a9f6df3c3ba4ae455f86864b10c10541f26f83cb8ed961a49
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 80,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_8",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_8/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 54.58, "l1_loss": 543.6, "l0": 80.091689453125, "frac_variance_explained": 0.8526171875, "cossim": 0.9221484375, "l2_ratio": 0.92234375, "relative_reconstruction_bias": 1.00125, "loss_original": 2.152294921875, "loss_reconstructed": 2.23978515625, "loss_zero": 12.4375, "frac_recovered": 0.9907421875, "frac_alive": 0.99298095703125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fcb46b6ec70f4b971faf416bceb715ba7aeba7850241111a62255128c3d57f13
|
3 |
+
size 302066710
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "BatchTopKTrainer",
|
4 |
+
"dict_class": "BatchTopKSAE",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"top_k_aux": 1152,
|
13 |
+
"seed": 0,
|
14 |
+
"activation_dim": 2304,
|
15 |
+
"dict_size": 16384,
|
16 |
+
"k": 160,
|
17 |
+
"device": "cuda:0",
|
18 |
+
"layer": 12,
|
19 |
+
"lm_name": "google/gemma-2-2b",
|
20 |
+
"wandb_name": "BatchTopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_9",
|
21 |
+
"submodule_name": "resid_post_layer_12"
|
22 |
+
},
|
23 |
+
"buffer": {
|
24 |
+
"d_submodule": 2304,
|
25 |
+
"io": "out",
|
26 |
+
"n_ctxs": 244,
|
27 |
+
"ctx_len": 1024,
|
28 |
+
"refresh_batch_size": 4,
|
29 |
+
"out_batch_size": 2048,
|
30 |
+
"device": "cuda:0"
|
31 |
+
}
|
32 |
+
}
|
gemma-2-2b_batch_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_9/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 49.27125, "l1_loss": 898.26, "l0": 158.950009765625, "frac_variance_explained": 0.88154296875, "cossim": 0.93748046875, "l2_ratio": 0.93708984375, "relative_reconstruction_bias": 1.00015625, "loss_original": 2.152294921875, "loss_reconstructed": 2.206025390625, "loss_zero": 12.4375, "frac_recovered": 0.99396484375, "frac_alive": 0.9896240234375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6b59deb52ed3542b238759a5c63a5d13efdada046e8be6fd08e77987cf0a9ccd
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 64.96625, "l1_loss": 280.28, "l0": 19.997119140625, "frac_variance_explained": 0.78849609375, "cossim": 0.88927734375, "l2_ratio": 0.889609375, "relative_reconstruction_bias": 1.0008203125, "loss_original": 2.152294921875, "loss_reconstructed": 2.403408203125, "loss_zero": 12.4375, "frac_recovered": 0.9754296875, "frac_alive": 0.949951171875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b0757ae9e45d28c0f15487b22b32bd47e8a2f3ebfb7f3ef2c6ed01dcf848de0
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 40,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_1",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_1/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 59.7275, "l1_loss": 382.27, "l0": 39.99611328125, "frac_variance_explained": 0.81900390625, "cossim": 0.90693359375, "l2_ratio": 0.90705078125, "relative_reconstruction_bias": 1.00095703125, "loss_original": 2.152294921875, "loss_reconstructed": 2.30181640625, "loss_zero": 12.4375, "frac_recovered": 0.985234375, "frac_alive": 0.98614501953125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:683749246f05b505533f29b04e9f04f4e2260dacb0c6e329422ed5a2934992f6
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 80,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_2",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_2/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 54.65125, "l1_loss": 529.98, "l0": 79.9989892578125, "frac_variance_explained": 0.84908203125, "cossim": 0.92232421875, "l2_ratio": 0.9225, "relative_reconstruction_bias": 1.00234375, "loss_original": 2.152294921875, "loss_reconstructed": 2.249189453125, "loss_zero": 12.4375, "frac_recovered": 0.99015625, "frac_alive": 0.9962158203125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:203a4f1acc8fc820a165a1245e7344959a603c0b2da8b9655b151c06ebba51d8
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 160,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_3",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_3/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 49.2875, "l1_loss": 773.14, "l0": 159.99974609375, "frac_variance_explained": 0.87669921875, "cossim": 0.9375, "l2_ratio": 0.93748046875, "relative_reconstruction_bias": 0.99919921875, "loss_original": 2.152294921875, "loss_reconstructed": 2.212333984375, "loss_zero": 12.4375, "frac_recovered": 0.9933984375, "frac_alive": 0.99652099609375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:384f7d4facce581a40eb20bf342fea7bef2a6e416b7b39532485155cd916bab3
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 320,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_4",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_4/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 42.815, "l1_loss": 1410.92, "l0": 319.9121923828125, "frac_variance_explained": 0.90875, "cossim": 0.953125, "l2_ratio": 0.953125, "relative_reconstruction_bias": 1.00017578125, "loss_original": 2.152294921875, "loss_reconstructed": 2.18783203125, "loss_zero": 12.4375, "frac_recovered": 0.99615234375, "frac_alive": 0.99505615234375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f6efe1d15269f95285203ba07c8b2872302e3494e98e57b3ba098883ff9df8a
|
3 |
+
size 302066710
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": 244140,
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 640,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_5",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12/trainer_5/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 34.06125, "l1_loss": 2503.6, "l0": 639.193583984375, "frac_variance_explained": 0.94197265625, "cossim": 0.97115234375, "l2_ratio": 0.9715625, "relative_reconstruction_bias": 1.0005859375, "loss_original": 2.152294921875, "loss_reconstructed": 2.167255859375, "loss_zero": 12.4375, "frac_recovered": 0.998359375, "frac_alive": 0.95263671875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_0/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:131924249230717015dfb802e9b890d7fec441de39c2d79d5b786d2571a8d689
|
3 |
+
size 302066858
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_0/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "0",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_244/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:715c667b68c8e965808597605943b7d4f28345c10f385bf7a12b469e3dd0d042
|
3 |
+
size 302066878
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_244/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "244",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_2441/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c8e60b992aa98c678742d578f0790d9a8d3bfbec909a75497a8811da3f560581
|
3 |
+
size 302066952
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_2441/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "2441",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_24414/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b800c3e165fb8c2cf46b642132cb882a4700dccbbf51a5db26a9f9137945204
|
3 |
+
size 302067154
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_24414/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "24414",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_772/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3dd0e312fb5074492dec14c4f40c927633d9da963728f71b634e5b5d64a117d8
|
3 |
+
size 302066878
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_772/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "772",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_7720/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8571f0c90e6b64b8f0d72aadcc8c4d233c46d587ac1c4265534dafa606654b73
|
3 |
+
size 302066952
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_7720/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "7720",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_77203/ae.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7568f021895dbdb9f70072dfe678cf488797d0bc4ffbeeacc4d662c0f7b08f1d
|
3 |
+
size 302067154
|
gemma-2-2b_top_k_width-2pow14_date-0107/resid_post_layer_12_checkpoints/trainer_0_step_77203/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"trainer_class": "TopKTrainer",
|
4 |
+
"dict_class": "AutoEncoderTopK",
|
5 |
+
"lr": 0.0003,
|
6 |
+
"steps": "77203",
|
7 |
+
"auxk_alpha": 0.03125,
|
8 |
+
"warmup_steps": 1000,
|
9 |
+
"decay_start": 195312,
|
10 |
+
"threshold_beta": 0.999,
|
11 |
+
"threshold_start_step": 1000,
|
12 |
+
"seed": 0,
|
13 |
+
"activation_dim": 2304,
|
14 |
+
"dict_size": 16384,
|
15 |
+
"k": 20,
|
16 |
+
"device": "cuda:0",
|
17 |
+
"layer": 12,
|
18 |
+
"lm_name": "google/gemma-2-2b",
|
19 |
+
"wandb_name": "TopKTrainer-google/gemma-2-2b-resid_post_layer_12_trainer_0",
|
20 |
+
"submodule_name": "resid_post_layer_12"
|
21 |
+
},
|
22 |
+
"buffer": {
|
23 |
+
"d_submodule": 2304,
|
24 |
+
"io": "out",
|
25 |
+
"n_ctxs": 244,
|
26 |
+
"ctx_len": 1024,
|
27 |
+
"refresh_batch_size": 4,
|
28 |
+
"out_batch_size": 2048,
|
29 |
+
"device": "cuda:0"
|
30 |
+
}
|
31 |
+
}
|