# SE-600M / config.yaml
# abhinadduri's picture
# Upload 6 files
# a64e0a6 verified
# Dataset settings: a handful of scalar options plus several named dataset
# profiles, each carrying a ds_type, filter flag(s), a dataset count and
# train/val CSV paths.
# NOTE(review): the nesting below was reconstructed from the alphabetical key
# order of the dump (a key that sorts before its predecessor must belong to the
# outer mapping) -- confirm against the code that loads this file.
dataset:
  # Quoted on purpose: a bare N (like Y/y/n) is a boolean under the YAML 1.1
  # type rules, so some parsers would turn this key into `false`.
  'N': 512
  P: 512
  S: 512
  cellxgene:
    ds_type: h5ad
    filter: false
    num_datasets: 1139
    train: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_cellxgene_train.csv
    val: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_cellxgene_val.csv
  cellxgene-tahoe:
    ds_type: filtered_h5ad
    filter: true
    filter_by_species: null
    # NOTE(review): same count as the plain `cellxgene` profile -- confirm.
    num_datasets: 1139
    train: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_tahoe_cellxgene_train_filtered.csv
    val: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_tahoe_cellxgene_val_filtered.csv
  chrom_token_right_idx: 2
  cls_token_idx: 3
  # Names one of the profiles defined in this mapping (presumably the active
  # one -- confirm against the loader).
  current: scbasecamp-cellxgene-tahoe-filtered
  name: vci
  num_cells: 36238464
  num_train_workers: 32
  num_val_workers: 8
  # Maps a dataset name to an explicit .h5ad path.
  overrides:
    rpe1_top5000_variable: /large_storage/ctc/datasets/vci/validation/rpe1_top5000_variable.h5ad
  pad_length: 2048
  pad_token_idx: 0
  scbasecamp-cellxgene-tahoe:
    ds_type: filtered_h5ad
    filter: true
    filter_by_species: null
    num_datasets: 15700
    train: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_train.csv
    val: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_val.csv
  scbasecamp-cellxgene-tahoe-filtered:
    ds_type: filtered_h5ad
    filter: true
    filter_by_species: null
    num_datasets: 14420
    train: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_train_filtered.csv
    val: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_val_filtered.csv
  seed: 42
# Gene-embedding settings: several named entries, each with the path of the
# embedding file, a dataset-to-embedding mapping file, `num`/`size` values and
# an optional valid-gene mask file.
# All three entries share the same `all_embeddings` path, `num` and `size`.
embeddings:
  # Names one of the entries below (presumably the active one -- confirm
  # against the loader).
  current: esm2-cellxgene-basecamp-tahoe
  esm2-cellxgene:
    all_embeddings: /large_storage/ctc/userspace/aadduri/data/auxillary/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM2.pt
    ds_emb_mapping: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_cellxgene_ds_mapping.torch
    num: 19790
    size: 5120
    valid_genes_masks: null
  esm2-cellxgene-basecamp-tahoe:
    all_embeddings: /large_storage/ctc/userspace/aadduri/data/auxillary/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM2.pt
    # Previous mapping path, kept for reference:
    # ds_emb_mapping: /home/aadduri/vci_pretrain/gene_embidx_mapping_cross.torch
    ds_emb_mapping: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_ds_mapping.torch
    num: 19790
    size: 5120
    valid_genes_masks: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_valid_masks.torch
  # NOTE(review): this entry uses the same `esm_basecount_tahoe_cellxgene_*`
  # mapping and mask files as `esm2-cellxgene-basecamp-tahoe` -- confirm that
  # is intentional.
  esm2-cellxgene-tahoe:
    all_embeddings: /large_storage/ctc/userspace/aadduri/data/auxillary/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM2.pt
    ds_emb_mapping: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_ds_mapping.torch
    num: 19790
    size: 5120
    valid_genes_masks: /large_storage/ctc/userspace/aadduri/data/auxillary/esm_basecount_tahoe_cellxgene_valid_masks.torch
# Run-level settings: checkpointing, node/GPU counts, epochs, profiling and
# validation cadence.
# NOTE(review): nesting reconstructed from the alphabetical key order of the
# dump -- confirm against the code that loads this file.
experiment:
  checkpoint:
    every_n_train_steps: 1000
    monitor: trainer/train_loss
    path: /data/checkpoints
    save_top_k: 4
  compiled: false
  ddp_timeout: 3600
  deaware: false
  limit_val_batches: 100
  local: local
  name: vci_1.5.0_600M_basecount_tahoe_cxg
  num_epochs: 16
  num_gpus_per_node: 8
  num_nodes: 2
  port: 12400
  profile:
    enable_profiler: false
    max_steps: 110
    profile_steps:
      - 10
      - 100
  # NOTE(review): placed under `experiment` rather than `profile`; key order
  # alone does not settle this -- confirm.
  val_check_interval: 1000
# Loss settings: the loss `name`, its `kernel`, and two boolean switches.
loss:
  apply_normalization: false
  kernel: energy
  name: tabular
  uniformity: false
# Model settings: batch size, layer/width values, dropout and feature switches.
model:
  batch_size: 48
  batch_tabular_loss: false
  counts: true
  d_hid: 2048
  dataset_correction: true
  dropout: 0.1
  # `ema` is off; `ema_decay` / `ema_update_interval` are presumably only read
  # when it is enabled -- confirm.
  ema: false
  ema_decay: 0.999
  ema_update_interval: 1000
  emsize: 2048
  name: vci
  nhead: 16
  nlayers: 16
  num_downsample: 1
  output_dim: 2048
  rda: true
  sample_rda: false
  use_flash_attention: true
  variable_masking: true
# Optimizer settings: learning rate, weight decay, gradient accumulation and
# gradient-norm limits.
optimizer:
  # NOTE(review): `start` / `end` look like learning-rate schedule factors
  # relative to `max_lr` -- confirm against the scheduler code.
  end: 1.0
  gradient_accumulation_steps: 8
  max_grad_norm: 0.8
  max_lr: 1.0e-05
  reset_lr_on_restart: false
  start: 0.01
  weight_decay: 0.01
  zclip: true
# Task settings.
# NOTE(review): `mask` is presumably a masking fraction -- confirm.
task:
  mask: 0.2
# Tokenizer settings.
# NOTE(review): treated as a top-level section; key order alone does not
# settle its placement -- confirm against the code that reads it.
# `token_dim` equals the `size: 5120` of the embedding entries in this file;
# presumably the two must stay in sync -- confirm.
tokenizer:
  token_dim: 5120
# Validation tasks: one mapping per task, each with a dataset path, a dataset
# name, an `enable` flag and task-specific column/label settings.
# Both tasks are disabled (`enable: false`) in this configuration.
validations:
  diff_exp:
    # NOTE(review): the `overrides` entry for `rpe1_top5000_variable` elsewhere
    # in this file points at
    # /large_storage/ctc/datasets/vci/validation/rpe1_top5000_variable.h5ad
    # -- a different directory for the same dataset name; confirm which path
    # is intended.
    dataset: /large_storage/ctc/datasets/cellxgene/processed/rpe1_top5000_variable.h5ad
    dataset_name: rpe1_top5000_variable
    enable: false
    eval_interval_multiple: 10
    method: null
    obs_filter_label: non-targeting
    obs_pert_col: gene
    top_k_rank: 200
  perturbation:
    ctrl_label: non-targeting
    dataset: /large_storage/ctc/datasets/vci/validation/replogle_perturbation.h5ad
    dataset_name: replogle_perturbation
    enable: false
    eval_interval_multiple: 10
    pert_col: gene
# Weights & Biases logging: on/off switch and the project name.
wandb:
  enable: true
  project: vci