Upload 9 files
- README.md +29 -17
- config.json +44 -44
- model.safetensors +2 -2
README.md CHANGED

````diff
@@ -1,11 +1,13 @@
 ---
 license: cc-by-nc-sa-4.0
 widget:
-- text:
+- text: AAAAGCGACATGACCAAACTGCCCCTCACCCGCCGCACTGATGACCGA
 tags:
 - DNA
 - biology
 - genomics
+datasets:
+- zhangtaolab/plant_reference_genomes
 ---
 # Plant foundation DNA large language models

@@ -18,13 +20,11 @@ All the models have a comparable model size between 90 MB and 150 MB, BPE tokeni
 ### Model Sources

 - **Repository:** [Plant DNA LLMs](https://github.com/zhangtaolab/plant_DNA_LLMs)
-- **Manuscript:** [Versatile applications of foundation DNA
+- **Manuscript:** [Versatile applications of foundation DNA language models in plant genomes]()

 ### Architecture

-The model is trained based on the
-
-This model is fine-tuned for predicting active core promoters.
+The model is trained based on the Google Gemma model with a modified config and tokenizer specific for DNA sequences.

 ### How to use

@@ -35,28 +35,40 @@ pip install transformers

 Here is a simple code example for inference:
 ```python
-from transformers import
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch

-model_name = 'plant-nucleotide-transformer-6mer
+model_name = 'plant-nucleotide-transformer-6mer'
 # load model and tokenizer
-model =
+model = AutoModelForCausalLM.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)

-#
-sequences = ['
-
-
-trust_remote_code=True, top_k=None)
-results = pipe(sequences)
-print(results)
+# example sequences and tokenization
+sequences = ['ATATACGGCCGNC', 'GGGTATCGCTTCCGAC']
+tokens = tokenizer(sequences, padding="longest")['input_ids']
+print(f"Tokenized sequences: {tokenizer.batch_decode(tokens)}")

+# inference
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+model.to(device)
+inputs = tokenizer(sequences, truncation=True, padding='max_length', max_length=512,
+                   return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}
+outs = model(
+    **inputs,
+    output_hidden_states=True
+)
+
+# get the final layer embeddings and prediction logits
+embeddings = outs['hidden_states'][-1].detach().cpu().numpy()
+logits = outs['logits'].detach().cpu().numpy()
 ```


 ### Training data
-We use
+We use the CausalLM method to pre-train the model; the tokenized sequences have a maximum length of 512.
 Detailed training procedure can be found in our manuscript.


 #### Hardware
-Model was trained on a NVIDIA
+The model was pre-trained on an NVIDIA RTX 4090 GPU (24 GB).
````
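The inference snippet in the new README stops at token-level hidden states. If per-sequence feature vectors are wanted (for example as input to a downstream classifier), a common convention is mask-aware mean pooling over the final layer. This is a minimal sketch, not something the model card prescribes; it reuses the `inputs` and `outs` variables from the README code above.

```python
# Hypothetical follow-on to the README snippet: collapse token-level hidden
# states into one vector per sequence with mask-aware mean pooling.
import numpy as np

mask = inputs['attention_mask'].detach().cpu().numpy()       # (batch, seq_len)
hidden = outs['hidden_states'][-1].detach().cpu().numpy()    # (batch, seq_len, hidden_size)

# zero out padded positions, then average over the real tokens only
masked = hidden * mask[:, :, None]
seq_embeddings = masked.sum(axis=1) / mask.sum(axis=1, keepdims=True)
print(seq_embeddings.shape)  # (2, hidden_size) for the two example sequences
```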
config.json CHANGED
@@ -1,45 +1,45 @@

```json
{
  "_name_or_path": "PlantDna_NT_6mer_plant-multi-species-core-promoters",
  "add_bias_fnn": false,
  "architectures": [
    "EsmForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "esm_config.EsmConfig",
    "AutoModelForMaskedLM": "modeling_esm.EsmForMaskedLM",
    "AutoModelForSequenceClassification": "modeling_esm.EsmForSequenceClassification",
    "AutoModelForTokenClassification": "modeling_esm.EsmForTokenClassification"
  },
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "id2label": {
    "0": "Not_promoter",
    "1": "promoter"
  },
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "is_folding_model": false,
  "label2id": {
    "Not_promoter": 0,
    "promoter": 1
  },
  "layer_norm_eps": 1e-12,
  "mask_token_id": 2,
  "max_position_embeddings": 2050,
  "model_type": "esm",
  "num_attention_heads": 16,
  "num_hidden_layers": 22,
  "pad_token_id": 1,
  "position_embedding_type": "rotary",
  "problem_type": "single_label_classification",
  "tie_word_embeddings": false,
  "token_dropout": false,
  "torch_dtype": "float32",
  "transformers_version": "4.39.1",
  "use_cache": false,
  "vocab_list": null,
  "vocab_size": 4107
}
```
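The new config declares an ESM-style classification head (`EsmForSequenceClassification` with `Not_promoter`/`promoter` labels) wired to custom code via `auto_map`, which lines up with the pipeline call the old README used (`trust_remote_code=True, top_k=None`). Below is a hedged sketch of that usage; the repo id is copied from the README snippet and is an assumption about which Hub repository this config belongs to.

```python
# Sketch: run the sequence classifier described by this config through the
# text-classification pipeline. The repo id is an assumption (taken from the
# README above), not confirmed by the config itself.
from transformers import pipeline

repo = 'zhangtaolab/plant-nucleotide-transformer-6mer'
pipe = pipeline('text-classification', model=repo, trust_remote_code=True, top_k=None)

sequences = ['ATATACGGCCGNC', 'GGGTATCGCTTCCGAC']
results = pipe(sequences)  # one list of {label, score} dicts per input sequence
print(results)
```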
model.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06
+size 391597600
```
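The pointer file stores only the sha256 (`oid`) and byte size of the weights kept in Git LFS. A small sketch, assuming `model.safetensors` has been downloaded locally, that verifies a file against those two values:

```python
# Verify a downloaded model.safetensors against the LFS pointer: the file's
# sha256 must equal the oid and its byte size must match exactly.
import hashlib
import os

path = 'model.safetensors'  # assumed local download path
expected_oid = 'dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06'
expected_size = 391597600

h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        h.update(chunk)

print(os.path.getsize(path) == expected_size and h.hexdigest() == expected_oid)
```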