KeeeeepGoing committed on
Commit 14cd3f7 · verified · 1 Parent(s): 1ca65ff

Upload 9 files

Files changed (3)
  1. README.md +29 -17
  2. config.json +44 -44
  3. model.safetensors +2 -2
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  license: cc-by-nc-sa-4.0
  widget:
- - text: AAAACATAATAATTTGCCGACTTACTCACCCTGTGATTAATCTATTTTCACTGTGTAGTAAGTAGAGAGTGTTACTTACTACAGTATCTATTTTTGTTTGGATGTTTGCCGTGGACAAGTGCTAACTGTCAAAACCCGTTTTGACCTTAAACCCAGCAATAATAATAATGTAAAACTCCATTGGGCAGTGCAACCTACTCCTCACATATTATATTATAATTCCTAAACCTTGATCAGTTAAATTAATAGCTCTGTTCCCTGTGGCTTTATATAAACACCATGGTTGTCAGCAGTTCAGCA
+ - text: AAAAGCGACATGACCAAACTGCCCCTCACCCGCCGCACTGATGACCGA
  tags:
  - DNA
  - biology
  - genomics
+ datasets:
+ - zhangtaolab/plant_reference_genomes
  ---
  # Plant foundation DNA large language models

@@ -18,13 +20,11 @@ All the models have a comparable model size between 90 MB and 150 MB, BPE tokeni
  ### Model Sources

  - **Repository:** [Plant DNA LLMs](https://github.com/zhangtaolab/plant_DNA_LLMs)
- - **Manuscript:** [Versatile applications of foundation DNA large language models in plant genomes]()
+ - **Manuscript:** [Versatile applications of foundation DNA language models in plant genomes]()

  ### Architecture

- The model is trained based on the InstaDeepAI/nucleotide-transformer-v2-100m-multi-species model with a modified tokenizer that replaces k-mer tokenization with BPE.
-
- This model is fine-tuned for predicting active core promoters.
+ The model is trained based on the Google Gemma model with a modified config and tokenizer specific to DNA sequences.

  ### How to use

@@ -35,28 +35,40 @@ pip install transformers

  Here is a simple code example for inference:
  ```python
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch

- model_name = 'plant-nucleotide-transformer-6mer-promoter'
+ model_name = 'plant-nucleotide-transformer-6mer'
  # load model and tokenizer
- model = AutoModelForSequenceClassification.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)

- # inference
- sequences = ['TTACTAAATTTATAACGATTTTTTATCTAACTTTAGCTCATCAATCTTTACCGTGTCAAAATTTAGTGCCAAGAAGCAGACATGGCCCGATGATCTTTTACCCTGTTTTCATAGCTCGCGAGCCGCGACCTGTGTCCAACCTCAACGGTCACTGCAGTCCCAGCACCTCAGCAGCCTGCGCCTGCCATACCCCCTCCCCCACCCACCCACACACACCATCCGGGCCCACGGTGGGACCCAGATGTCATGCGCTGTACGGGCGAGCAACTAGCCCCCACCTCTTCCCAAGAGGCAAAACCT',
-              'GACCTAATGATTAACCAAGGAAAAATGCAAGGATTTGACAAAAATATAGAAGCCAATGCTAGGCGCCTAAGTGAATGGATATGAAACAAAAAGCGAGCAGGCTGTCTATATATGGACAATTAGTTGCATTAATATAGTAGTTTATAATTGCAAGCATGGCACTACATCACAACACCTAAAAGACATGCCGTGATGCTAGAACAGCCATTGAATAAATTAGAAAGAAAGGTTGTGGTTAATTAGTTAACGACCAATCGAGCCTACTAGTATAAATTGTACCTCGTTGTTATGAAGTAATTC']
- pipe = pipeline('text-classification', model=model, tokenizer=tokenizer,
-                 trust_remote_code=True, top_k=None)
- results = pipe(sequences)
- print(results)
+ # example sequences and tokenization
+ sequences = ['ATATACGGCCGNC', 'GGGTATCGCTTCCGAC']
+ tokens = tokenizer(sequences, padding="longest")['input_ids']
+ print(f"Tokenized sequences: {tokenizer.batch_decode(tokens)}")

+ # inference
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ model.to(device)
+ inputs = tokenizer(sequences, truncation=True, padding='max_length', max_length=512,
+                    return_tensors="pt")
+ inputs = {k: v.to(device) for k, v in inputs.items()}
+ outs = model(
+     **inputs,
+     output_hidden_states=True
+ )
+
+ # get the final-layer embeddings and prediction logits
+ embeddings = outs['hidden_states'][-1].detach().cpu().numpy()
+ logits = outs['logits'].detach().cpu().numpy()
  ```


  ### Training data
- We use EsmForSequenceClassification to fine-tune the model.
+ We use the causal language modeling (CausalLM) method to pre-train the model; the tokenized sequences have a maximum length of 512.
  Detailed training procedure can be found in our manuscript.


  #### Hardware
- The model was trained on an NVIDIA GTX 1080 Ti GPU (11 GB).
+ The model was pre-trained on an NVIDIA RTX 4090 GPU (24 GB).
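The updated "Training data" note describes causal language modeling (CausalLM) pre-training with 512-token inputs. Below is a minimal sketch of how that wiring looks in `transformers`; it is not the authors' training script — the dataset column name (`sequence`), the split, and all hyperparameters are assumptions, and for brevity it continues from the released checkpoint rather than starting from a fresh config.

```python
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

model_name = 'plant-nucleotide-transformer-6mer'  # repo name taken from the README above
tokenizer = AutoTokenizer.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)

# assumption: the dataset exposes raw DNA strings in a 'sequence' column
ds = load_dataset('zhangtaolab/plant_reference_genomes', split='train')

def tokenize(batch):
    # cap inputs at the 512-token maximum length stated in the README
    return tokenizer(batch['sequence'], truncation=True, max_length=512)

tokenized = ds.map(tokenize, batched=True, remove_columns=ds.column_names)

# mlm=False selects the causal-LM objective; labels become shifted copies of input_ids
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(output_dir='plant_dna_clm',  # illustrative hyperparameters
                         per_device_train_batch_size=8,
                         num_train_epochs=1)
Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator).train()
```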
config.json CHANGED
@@ -1,45 +1,45 @@
  {
- "_name_or_path": "../model/PlantDna_NT_6mer",
- "add_bias_fnn": false,
- "architectures": [
- "EsmForSequenceClassification"
- ],
- "attention_probs_dropout_prob": 0.0,
- "auto_map": {
- "AutoConfig": "esm_config.EsmConfig",
- "AutoModelForMaskedLM": "modeling_esm.EsmForMaskedLM",
- "AutoModelForSequenceClassification": "modeling_esm.EsmForSequenceClassification",
- "AutoModelForTokenClassification": "modeling_esm.EsmForTokenClassification"
- },
- "emb_layer_norm_before": false,
- "esmfold_config": null,
- "hidden_dropout_prob": 0.0,
- "hidden_size": 512,
- "id2label": {
- "0": "Not_promoter",
- "1": "promoter"
- },
- "initializer_range": 0.02,
- "intermediate_size": 2048,
- "is_folding_model": false,
- "label2id": {
- "Not_promoter": 0,
- "promoter": 1
- },
- "layer_norm_eps": 1e-12,
- "mask_token_id": 2,
- "max_position_embeddings": 2050,
- "model_type": "esm",
- "num_attention_heads": 16,
- "num_hidden_layers": 22,
- "pad_token_id": 1,
- "position_embedding_type": "rotary",
- "problem_type": "single_label_classification",
- "tie_word_embeddings": false,
- "token_dropout": false,
- "torch_dtype": "float32",
- "transformers_version": "4.42.4",
- "use_cache": false,
- "vocab_list": null,
- "vocab_size": 4107
- }
+ "_name_or_path": "PlantDna_NT_6mer_plant-multi-species-core-promoters",
+ "add_bias_fnn": false,
+ "architectures": [
+ "EsmForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "auto_map": {
+ "AutoConfig": "esm_config.EsmConfig",
+ "AutoModelForMaskedLM": "modeling_esm.EsmForMaskedLM",
+ "AutoModelForSequenceClassification": "modeling_esm.EsmForSequenceClassification",
+ "AutoModelForTokenClassification": "modeling_esm.EsmForTokenClassification"
+ },
+ "emb_layer_norm_before": false,
+ "esmfold_config": null,
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 512,
+ "id2label": {
+ "0": "Not_promoter",
+ "1": "promoter"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 2048,
+ "is_folding_model": false,
+ "label2id": {
+ "Not_promoter": 0,
+ "promoter": 1
+ },
+ "layer_norm_eps": 1e-12,
+ "mask_token_id": 2,
+ "max_position_embeddings": 2050,
+ "model_type": "esm",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 22,
+ "pad_token_id": 1,
+ "position_embedding_type": "rotary",
+ "problem_type": "single_label_classification",
+ "tie_word_embeddings": false,
+ "token_dropout": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.39.1",
+ "use_cache": false,
+ "vocab_list": null,
+ "vocab_size": 4107
+ }
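The `auto_map` block above routes `AutoConfig` and the `AutoModel*` classes to custom `esm_config`/`modeling_esm` modules shipped with the repository, which is why the README passes `trust_remote_code=True`. A small sketch for inspecting the classification setup follows; the repo id is inferred from the old README's `model_name` and may differ from the actual repository.

```python
from transformers import AutoConfig

# repo id inferred from the old README; adjust if the repository is named differently
config = AutoConfig.from_pretrained('zhangtaolab/plant-nucleotide-transformer-6mer-promoter',
                                    trust_remote_code=True)

print(config.model_type)                             # 'esm'
print(config.id2label)                               # {0: 'Not_promoter', 1: 'promoter'}
print(config.num_hidden_layers, config.hidden_size)  # 22, 512
```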
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:295ddf2e2dc30603b9972b6c03ddf5b38842c6af9cf140e0109ed069823c6825
- size 383169868
+ oid sha256:dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06
+ size 391597600
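`model.safetensors` is stored as a Git LFS pointer, so this commit only swaps the object id (sha256) and byte size; the weights themselves live in LFS storage. A quick check that a downloaded copy matches the new pointer is sketched below — the repo id is again an assumption, while the hash and size come from the pointer above.

```python
import hashlib
import os
from huggingface_hub import hf_hub_download

# repo id inferred from the README; adjust to the actual repository name
path = hf_hub_download('zhangtaolab/plant-nucleotide-transformer-6mer-promoter',
                       'model.safetensors')

sha = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        sha.update(chunk)

# expected values taken from the new LFS pointer in this commit
assert os.path.getsize(path) == 391597600
assert sha.hexdigest() == 'dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06'
```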