Tags: Text Generation · Transformers · Safetensors · English · ddllama · conversational · custom_code
xuan luo committed (verified)
Commit dcd46e7 · Parent: e7981ed

Upload configuration_ddllama.py with huggingface_hub

Files changed (1): configuration_ddllama.py (+97, -0)
configuration_ddllama.py ADDED
@@ -0,0 +1,97 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DDLLaMA model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+class DDLlamaConfig(PretrainedConfig):
+    model_type = "ddllama"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+
+    def __init__(
+        self,
+        vocab_size=128256,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        router_layers=[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
+        router_reduction_factor=16,
+        proj_reduction_factor=16,
+        hidden_act="silu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=128000,
+        eos_token_id=128009,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=500000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        head_dim=128,
+        torch_dtype="bfloat16",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.router_layers = router_layers
+        self.router_reduction_factor = router_reduction_factor
+        self.proj_reduction_factor = proj_reduction_factor
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.pretraining_tp = pretraining_tp
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim
+        self.torch_dtype = torch_dtype
+
+
+        # Validate the RoPE configuration
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
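
For context, a minimal usage sketch of this configuration class. It assumes the file either sits next to your script or is wired up as Hub custom code; the repository id below is a placeholder, not taken from this commit, and loading remote custom code through transformers requires trust_remote_code=True.

# Minimal usage sketch; the repo id is a placeholder (assumption, not from this commit).
from transformers import AutoConfig

# Load via the Hub; trust_remote_code=True is required because DDLlamaConfig
# is not part of the transformers library itself.
config = AutoConfig.from_pretrained("your-namespace/your-ddllama-model", trust_remote_code=True)

# Or instantiate directly from the uploaded module (file in the working directory).
from configuration_ddllama import DDLlamaConfig

config = DDLlamaConfig()               # defaults as declared in the diff above
print(config.model_type)               # "ddllama"
print(config.router_layers)            # [16, 17, ..., 31]
print(config.router_reduction_factor,  # 16
      config.proj_reduction_factor)    # 16

On the Hub, a custom class like this is normally exposed through an auto_map entry in config.json (for example, "AutoConfig": "configuration_ddllama.DDLlamaConfig"); that file is not part of this commit, so the AutoConfig path above is an assumption about how the repository is set up.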