Add files using upload-large-folder tool
Browse files
- tokenization_kimi.py +16 -6
- tokenizer_config.json +48 -8
tokenization_kimi.py
CHANGED
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
158 |
def encode(
|
159 |
self,
|
160 |
text: str,
|
|
|
161 |
**kwargs
|
162 |
) -> List[int]:
|
163 |
"""
|
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
203 |
|
204 |
t: List[int] = []
|
205 |
for substr in all_substrs:
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
)
|
212 |
-
)
|
213 |
|
214 |
return t
|
215 |
|
|
|
158 |
def encode(
|
159 |
self,
|
160 |
text: str,
|
161 |
+
allow_special_tokens: bool = True,
|
162 |
**kwargs
|
163 |
) -> List[int]:
|
164 |
"""
|
|
|
204 |
|
205 |
t: List[int] = []
|
206 |
for substr in all_substrs:
|
207 |
+
if allow_special_tokens:
|
208 |
+
t.extend(
|
209 |
+
# special-token text in the input is encoded as the corresponding special token ids
|
210 |
+
self.model.encode(
|
211 |
+
substr,
|
212 |
+
allowed_special="all",
|
213 |
+
)
|
214 |
+
)
|
215 |
+
else:
|
216 |
+
t.extend(
|
217 |
+
# treat special-token text as ordinary text: it is encoded like any other string
|
218 |
+
self.model.encode(
|
219 |
+
substr,
|
220 |
+
disallowed_special=(),
|
221 |
+
)
|
222 |
)
|
|
|
223 |
|
224 |
return t
|
225 |
|
tokenizer_config.json
CHANGED
@@ -72,6 +72,46 @@
|
|
72 |
"single_word": false,
|
73 |
"special": true
|
74 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
"163601": {
|
76 |
"content": "<|im_middle|>",
|
77 |
"lstrip": false,
|
@@ -107,19 +147,19 @@
|
|
107 |
"<|im_system|>",
|
108 |
"<|im_middle|>"
|
109 |
],
|
110 |
-
"auto_map": {
|
111 |
-
"AutoTokenizer": [
|
112 |
-
"tokenization_kimi.TikTokenTokenizer",
|
113 |
-
null
|
114 |
-
]
|
115 |
-
},
|
116 |
"bos_token": "[BOS]",
|
117 |
-
"chat_template": "{% if tools -%}\n {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n {{- tools | tojson -}}\n {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n {{ '<|im_system|>system<|im_middle|>' }}\n {%- elif message['role'] == 'user' -%}\n {{ '<|im_user|>user<|im_middle|>' }}\n {%- elif message['role'] == 'assistant' -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n {%- elif message['role'] == 'tool' -%}\n {{ '<|im_system|>tool<|im_middle|>' }}\n {%- endif -%}\n\n {%- if message['content'] is string -%}\n {{- message['content'] + '<|im_end|>' -}}\n {%- else -%}\n {%- for content in message['content'] -%}\n {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n {%- else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {{ '<|im_end|>' }}\n {%- endif -%}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}",
|
118 |
"clean_up_tokenization_spaces": false,
|
119 |
"eos_token": "[EOS]",
|
120 |
"extra_special_tokens": {},
|
|
|
121 |
"model_max_length": 1000000000000000019884624838656,
|
122 |
"pad_token": "[PAD]",
|
123 |
"tokenizer_class": "TikTokenTokenizer",
|
124 |
-
"unk_token": "[UNK]"
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
}
|
|
|
72 |
"single_word": false,
|
73 |
"special": true
|
74 |
},
|
75 |
+
"163595": {
|
76 |
+
"content": "<|tool_calls_section_begin|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": false
|
82 |
+
},
|
83 |
+
"163596": {
|
84 |
+
"content": "<|tool_calls_section_end|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": false
|
90 |
+
},
|
91 |
+
"163597": {
|
92 |
+
"content": "<|tool_call_begin|>",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": false
|
98 |
+
},
|
99 |
+
"163598": {
|
100 |
+
"content": "<|tool_call_argument_begin|>",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false,
|
105 |
+
"special": false
|
106 |
+
},
|
107 |
+
"163599": {
|
108 |
+
"content": "<|tool_call_end|>",
|
109 |
+
"lstrip": false,
|
110 |
+
"normalized": false,
|
111 |
+
"rstrip": false,
|
112 |
+
"single_word": false,
|
113 |
+
"special": false
|
114 |
+
},
|
115 |
"163601": {
|
116 |
"content": "<|im_middle|>",
|
117 |
"lstrip": false,
|
|
|
147 |
"<|im_system|>",
|
148 |
"<|im_middle|>"
|
149 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
"bos_token": "[BOS]",
|
|
|
151 |
"clean_up_tokenization_spaces": false,
|
152 |
"eos_token": "[EOS]",
|
153 |
"extra_special_tokens": {},
|
154 |
+
"chat_template": "{%- if tools -%}\n <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>\n{%- endif -%}\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n <|im_system|>system<|im_middle|>\n {%- elif message['role'] == 'user' -%}\n <|im_user|>user<|im_middle|>\n {%- elif message['role'] == 'assistant' -%}\n <|im_assistant|>assistant<|im_middle|>\n {%- elif message['role'] == 'tool' -%}\n <|im_system|>tool<|im_middle|>\n {%- endif -%}\n {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}\n <|tool_calls_section_begin|>\n {%- for tool_call in message['tool_calls'] -%}\n {%- set func_name = tool_call['function']['name'] -%}\n {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>\n {%- endfor -%}\n <|tool_calls_section_end|>\n {%- elif message['role'] == 'tool' -%}\n ## Return of {{ message.tool_call_id }}\\n{{ message['content'] }}\n {%- elif message['content'] is string -%}\n {{ message['content'] }}\n {%- elif message['content'] is not none -%}\n {% for content in message['content'] -%}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n <|media_start|>image<|media_content|><|media_pad|><|media_end|>\n {% else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n <|im_end|>\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|im_assistant|>assistant<|im_middle|>\n{%- endif -%}",
|
155 |
"model_max_length": 1000000000000000019884624838656,
|
156 |
"pad_token": "[PAD]",
|
157 |
"tokenizer_class": "TikTokenTokenizer",
|
158 |
+
"unk_token": "[UNK]",
|
159 |
+
"auto_map": {
|
160 |
+
"AutoTokenizer": [
|
161 |
+
"tokenization_kimi.TikTokenTokenizer",
|
162 |
+
null
|
163 |
+
]
|
164 |
+
}
|
165 |
}
|