ekurtic committed on
Commit
43bce20
·
verified ·
1 Parent(s): ecbdad4

Add files using upload-large-folder tool

Browse files
Files changed (2) hide show
  1. tokenization_kimi.py +16 -6
  2. tokenizer_config.json +48 -8
tokenization_kimi.py CHANGED
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
158
  def encode(
159
  self,
160
  text: str,
 
161
  **kwargs
162
  ) -> List[int]:
163
  """
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
203
 
204
  t: List[int] = []
205
  for substr in all_substrs:
206
- t.extend(
207
- # we should consider special token as a common token
208
- self.model.encode(
209
- substr,
210
- disallowed_special=(),
 
 
 
 
 
 
 
 
 
 
211
  )
212
- )
213
 
214
  return t
215
 
 
158
  def encode(
159
  self,
160
  text: str,
161
+ allow_special_tokens: bool = True,
162
  **kwargs
163
  ) -> List[int]:
164
  """
 
204
 
205
  t: List[int] = []
206
  for substr in all_substrs:
207
+ if allow_special_tokens:
208
+ t.extend(
209
+ # we should consider special token as a common token
210
+ self.model.encode(
211
+ substr,
212
+ allowed_special="all",
213
+ )
214
+ )
215
+ else:
216
+ t.extend(
217
+ # we should consider special token as a common token
218
+ self.model.encode(
219
+ substr,
220
+ disallowed_special=(),
221
+ )
222
  )
 
223
 
224
  return t
225
 
tokenizer_config.json CHANGED
@@ -72,6 +72,46 @@
72
  "single_word": false,
73
  "special": true
74
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "163601": {
76
  "content": "<|im_middle|>",
77
  "lstrip": false,
@@ -107,19 +147,19 @@
107
  "<|im_system|>",
108
  "<|im_middle|>"
109
  ],
110
- "auto_map": {
111
- "AutoTokenizer": [
112
- "tokenization_kimi.TikTokenTokenizer",
113
- null
114
- ]
115
- },
116
  "bos_token": "[BOS]",
117
- "chat_template": "{% if tools -%}\n {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n {{- tools | tojson -}}\n {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n {{ '<|im_system|>system<|im_middle|>' }}\n {%- elif message['role'] == 'user' -%}\n {{ '<|im_user|>user<|im_middle|>' }}\n {%- elif message['role'] == 'assistant' -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n {%- elif message['role'] == 'tool' -%}\n {{ '<|im_system|>tool<|im_middle|>' }}\n {%- endif -%}\n\n {%- if message['content'] is string -%}\n {{- message['content'] + '<|im_end|>' -}}\n {%- else -%}\n {%- for content in message['content'] -%}\n {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n {%- else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {{ '<|im_end|>' }}\n {%- endif -%}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}",
118
  "clean_up_tokenization_spaces": false,
119
  "eos_token": "[EOS]",
120
  "extra_special_tokens": {},
 
121
  "model_max_length": 1000000000000000019884624838656,
122
  "pad_token": "[PAD]",
123
  "tokenizer_class": "TikTokenTokenizer",
124
- "unk_token": "[UNK]"
 
 
 
 
 
 
125
  }
 
72
  "single_word": false,
73
  "special": true
74
  },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
  "163601": {
116
  "content": "<|im_middle|>",
117
  "lstrip": false,
 
147
  "<|im_system|>",
148
  "<|im_middle|>"
149
  ],
 
 
 
 
 
 
150
  "bos_token": "[BOS]",
 
151
  "clean_up_tokenization_spaces": false,
152
  "eos_token": "[EOS]",
153
  "extra_special_tokens": {},
154
+ "chat_template": "{%- if tools -%}\n <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>\n{%- endif -%}\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n <|im_system|>system<|im_middle|>\n {%- elif message['role'] == 'user' -%}\n <|im_user|>user<|im_middle|>\n {%- elif message['role'] == 'assistant' -%}\n <|im_assistant|>assistant<|im_middle|>\n {%- elif message['role'] == 'tool' -%}\n <|im_system|>tool<|im_middle|>\n {%- endif -%}\n {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}\n <|tool_calls_section_begin|>\n {%- for tool_call in message['tool_calls'] -%}\n {%- set func_name = tool_call['function']['name'] -%}\n {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>\n {%- endfor -%}\n <|tool_calls_section_end|>\n {%- elif message['role'] == 'tool' -%}\n ## Return of {{ message.tool_call_id }}\\n{{ message['content'] }}\n {%- elif message['content'] is string -%}\n {{ message['content'] }}\n {%- elif message['content'] is not none -%}\n {% for content in message['content'] -%}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n <|media_start|>image<|media_content|><|media_pad|><|media_end|>\n {% else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n <|im_end|>\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|im_assistant|>assistant<|im_middle|>\n{%- endif -%}",
155
  "model_max_length": 1000000000000000019884624838656,
156
  "pad_token": "[PAD]",
157
  "tokenizer_class": "TikTokenTokenizer",
158
+ "unk_token": "[UNK]",
159
+ "auto_map": {
160
+ "AutoTokenizer": [
161
+ "tokenization_kimi.TikTokenTokenizer",
162
+ null
163
+ ]
164
+ }
165
  }