KeeeeepGoing committed on
Commit 14cd3f7 · verified · 1 Parent(s): 1ca65ff

Upload 9 files

Files changed (3)
  1. README.md +29 -17
  2. config.json +44 -44
  3. model.safetensors +2 -2
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  license: cc-by-nc-sa-4.0
  widget:
- - text: AAAACATAATAATTTGCCGACTTACTCACCCTGTGATTAATCTATTTTCACTGTGTAGTAAGTAGAGAGTGTTACTTACTACAGTATCTATTTTTGTTTGGATGTTTGCCGTGGACAAGTGCTAACTGTCAAAACCCGTTTTGACCTTAAACCCAGCAATAATAATAATGTAAAACTCCATTGGGCAGTGCAACCTACTCCTCACATATTATATTATAATTCCTAAACCTTGATCAGTTAAATTAATAGCTCTGTTCCCTGTGGCTTTATATAAACACCATGGTTGTCAGCAGTTCAGCA
+ - text: AAAAGCGACATGACCAAACTGCCCCTCACCCGCCGCACTGATGACCGA
  tags:
  - DNA
  - biology
  - genomics
+ datasets:
+ - zhangtaolab/plant_reference_genomes
  ---
  # Plant foundation DNA large language models

@@ -18,13 +20,11 @@ All the models have a comparable model size between 90 MB and 150 MB, BPE tokeni
  ### Model Sources

  - **Repository:** [Plant DNA LLMs](https://github.com/zhangtaolab/plant_DNA_LLMs)
- - **Manuscript:** [Versatile applications of foundation DNA large language models in plant genomes]()
+ - **Manuscript:** [Versatile applications of foundation DNA language models in plant genomes]()

  ### Architecture

- The model is trained based on the InstaDeepAI/nucleotide-transformer-v2-100m-multi-species model with a modified tokenizer that replaces k-mer tokenization with BPE.
-
- This model is fine-tuned for predicting active core promoters.
+ The model is trained based on the Google Gemma model with a modified config and tokenizer specific to DNA sequences.

  ### How to use

@@ -35,28 +35,40 @@ pip install transformers

  Here is a simple code example for inference:
  ```python
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch

- model_name = 'plant-nucleotide-transformer-6mer-promoter'
+ model_name = 'plant-nucleotide-transformer-6mer'
  # load model and tokenizer
- model = AutoModelForSequenceClassification.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)

- # inference
- sequences = ['TTACTAAATTTATAACGATTTTTTATCTAACTTTAGCTCATCAATCTTTACCGTGTCAAAATTTAGTGCCAAGAAGCAGACATGGCCCGATGATCTTTTACCCTGTTTTCATAGCTCGCGAGCCGCGACCTGTGTCCAACCTCAACGGTCACTGCAGTCCCAGCACCTCAGCAGCCTGCGCCTGCCATACCCCCTCCCCCACCCACCCACACACACCATCCGGGCCCACGGTGGGACCCAGATGTCATGCGCTGTACGGGCGAGCAACTAGCCCCCACCTCTTCCCAAGAGGCAAAACCT',
-              'GACCTAATGATTAACCAAGGAAAAATGCAAGGATTTGACAAAAATATAGAAGCCAATGCTAGGCGCCTAAGTGAATGGATATGAAACAAAAAGCGAGCAGGCTGTCTATATATGGACAATTAGTTGCATTAATATAGTAGTTTATAATTGCAAGCATGGCACTACATCACAACACCTAAAAGACATGCCGTGATGCTAGAACAGCCATTGAATAAATTAGAAAGAAAGGTTGTGGTTAATTAGTTAACGACCAATCGAGCCTACTAGTATAAATTGTACCTCGTTGTTATGAAGTAATTC']
- pipe = pipeline('text-classification', model=model, tokenizer=tokenizer,
-                 trust_remote_code=True, top_k=None)
- results = pipe(sequences)
- print(results)
+ # example sequences and tokenization
+ sequences = ['ATATACGGCCGNC', 'GGGTATCGCTTCCGAC']
+ tokens = tokenizer(sequences, padding="longest")['input_ids']
+ print(f"Tokenized sequences: {tokenizer.batch_decode(tokens)}")

+ # inference
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ model.to(device)
+ inputs = tokenizer(sequences, truncation=True, padding='max_length', max_length=512,
+                    return_tensors="pt")
+ inputs = {k: v.to(device) for k, v in inputs.items()}
+ outs = model(
+     **inputs,
+     output_hidden_states=True
+ )
+
+ # get the final-layer embeddings and prediction logits
+ embeddings = outs['hidden_states'][-1].detach().cpu().numpy()
+ logits = outs['logits'].detach().cpu().numpy()
  ```


  ### Training data
- We use EsmForSequenceClassification to fine-tune the model.
+ We use the causal language modeling (CausalLM) method to pre-train the model; the tokenized sequences have a maximum length of 512.
  Detailed training procedure can be found in our manuscript.


  #### Hardware
- The model was trained on an NVIDIA GTX 1080 Ti GPU (11 GB).
+ The model was pre-trained on an NVIDIA RTX 4090 GPU (24 GB).
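The updated "Training data" note describes causal language modeling (CausalLM) pre-training with 512-token inputs. Below is a minimal sketch of how that wiring looks in `transformers`; it is not the authors' training script — the dataset column name (`sequence`), the split, and all hyperparameters are assumptions, and for brevity it continues from the released checkpoint rather than starting from a fresh config.

```python
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

model_name = 'plant-nucleotide-transformer-6mer'  # repo name taken from the README above
tokenizer = AutoTokenizer.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(f'zhangtaolab/{model_name}', trust_remote_code=True)

# assumption: the dataset exposes raw DNA strings in a 'sequence' column
ds = load_dataset('zhangtaolab/plant_reference_genomes', split='train')

def tokenize(batch):
    # cap inputs at the 512-token maximum length stated in the README
    return tokenizer(batch['sequence'], truncation=True, max_length=512)

tokenized = ds.map(tokenize, batched=True, remove_columns=ds.column_names)

# mlm=False selects the causal-LM objective; labels become shifted copies of input_ids
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(output_dir='plant_dna_clm',  # illustrative hyperparameters
                         per_device_train_batch_size=8,
                         num_train_epochs=1)
Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator).train()
```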
config.json CHANGED
@@ -1,45 +1,45 @@
  {
- "_name_or_path": "../model/PlantDna_NT_6mer",
- "add_bias_fnn": false,
- "architectures": [
- "EsmForSequenceClassification"
- ],
- "attention_probs_dropout_prob": 0.0,
- "auto_map": {
- "AutoConfig": "esm_config.EsmConfig",
- "AutoModelForMaskedLM": "modeling_esm.EsmForMaskedLM",
- "AutoModelForSequenceClassification": "modeling_esm.EsmForSequenceClassification",
- "AutoModelForTokenClassification": "modeling_esm.EsmForTokenClassification"
- },
- "emb_layer_norm_before": false,
- "esmfold_config": null,
- "hidden_dropout_prob": 0.0,
- "hidden_size": 512,
- "id2label": {
- "0": "Not_promoter",
- "1": "promoter"
- },
- "initializer_range": 0.02,
- "intermediate_size": 2048,
- "is_folding_model": false,
- "label2id": {
- "Not_promoter": 0,
- "promoter": 1
- },
- "layer_norm_eps": 1e-12,
- "mask_token_id": 2,
- "max_position_embeddings": 2050,
- "model_type": "esm",
- "num_attention_heads": 16,
- "num_hidden_layers": 22,
- "pad_token_id": 1,
- "position_embedding_type": "rotary",
- "problem_type": "single_label_classification",
- "tie_word_embeddings": false,
- "token_dropout": false,
- "torch_dtype": "float32",
- "transformers_version": "4.42.4",
- "use_cache": false,
- "vocab_list": null,
- "vocab_size": 4107
- }
+ "_name_or_path": "PlantDna_NT_6mer_plant-multi-species-core-promoters",
+ "add_bias_fnn": false,
+ "architectures": [
+ "EsmForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "auto_map": {
+ "AutoConfig": "esm_config.EsmConfig",
+ "AutoModelForMaskedLM": "modeling_esm.EsmForMaskedLM",
+ "AutoModelForSequenceClassification": "modeling_esm.EsmForSequenceClassification",
+ "AutoModelForTokenClassification": "modeling_esm.EsmForTokenClassification"
+ },
+ "emb_layer_norm_before": false,
+ "esmfold_config": null,
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 512,
+ "id2label": {
+ "0": "Not_promoter",
+ "1": "promoter"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 2048,
+ "is_folding_model": false,
+ "label2id": {
+ "Not_promoter": 0,
+ "promoter": 1
+ },
+ "layer_norm_eps": 1e-12,
+ "mask_token_id": 2,
+ "max_position_embeddings": 2050,
+ "model_type": "esm",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 22,
+ "pad_token_id": 1,
+ "position_embedding_type": "rotary",
+ "problem_type": "single_label_classification",
+ "tie_word_embeddings": false,
+ "token_dropout": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.39.1",
+ "use_cache": false,
+ "vocab_list": null,
+ "vocab_size": 4107
+ }
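The `auto_map` block above routes `AutoConfig` and the `AutoModel*` classes to custom `esm_config`/`modeling_esm` modules shipped with the repository, which is why the README passes `trust_remote_code=True`. A small sketch for inspecting the classification setup follows; the repo id is inferred from the old README's `model_name` and may differ from the actual repository.

```python
from transformers import AutoConfig

# repo id inferred from the old README; adjust if the repository is named differently
config = AutoConfig.from_pretrained('zhangtaolab/plant-nucleotide-transformer-6mer-promoter',
                                    trust_remote_code=True)

print(config.model_type)                             # 'esm'
print(config.id2label)                               # {0: 'Not_promoter', 1: 'promoter'}
print(config.num_hidden_layers, config.hidden_size)  # 22, 512
```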
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:295ddf2e2dc30603b9972b6c03ddf5b38842c6af9cf140e0109ed069823c6825
- size 383169868
+ oid sha256:dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06
+ size 391597600
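`model.safetensors` is stored as a Git LFS pointer, so this commit only swaps the object id (sha256) and byte size; the weights themselves live in LFS storage. A quick check that a downloaded copy matches the new pointer is sketched below — the repo id is again an assumption, while the hash and size come from the pointer above.

```python
import hashlib
import os
from huggingface_hub import hf_hub_download

# repo id inferred from the README; adjust to the actual repository name
path = hf_hub_download('zhangtaolab/plant-nucleotide-transformer-6mer-promoter',
                       'model.safetensors')

sha = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        sha.update(chunk)

# expected values taken from the new LFS pointer in this commit
assert os.path.getsize(path) == 391597600
assert sha.hexdigest() == 'dfa58fa873be62129d0957446590107f4f754b64b4e7a3bc96e1f1062756bb06'
```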