Corianas committed on
Commit
efc5676
·
verified ·
1 Parent(s): 008a4b8

Upload 3 files

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +6 -0
  2. tokenizer.json +405 -0
  3. tokenizer_config.json +44 -0
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<bos>",
3
+ "eos_token": "<eos>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer.json ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<unk>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Sequence",
45
+ "normalizers": [
46
+ {
47
+ "type": "NFKC"
48
+ },
49
+ {
50
+ "type": "Replace",
51
+ "pattern": {
52
+ "Regex": "A"
53
+ },
54
+ "content": "↨a"
55
+ },
56
+ {
57
+ "type": "Replace",
58
+ "pattern": {
59
+ "Regex": "B"
60
+ },
61
+ "content": "↨b"
62
+ },
63
+ {
64
+ "type": "Replace",
65
+ "pattern": {
66
+ "Regex": "C"
67
+ },
68
+ "content": "↨c"
69
+ },
70
+ {
71
+ "type": "Replace",
72
+ "pattern": {
73
+ "Regex": "D"
74
+ },
75
+ "content": "↨d"
76
+ },
77
+ {
78
+ "type": "Replace",
79
+ "pattern": {
80
+ "Regex": "E"
81
+ },
82
+ "content": "↨e"
83
+ },
84
+ {
85
+ "type": "Replace",
86
+ "pattern": {
87
+ "Regex": "F"
88
+ },
89
+ "content": "↨f"
90
+ },
91
+ {
92
+ "type": "Replace",
93
+ "pattern": {
94
+ "Regex": "G"
95
+ },
96
+ "content": "↨g"
97
+ },
98
+ {
99
+ "type": "Replace",
100
+ "pattern": {
101
+ "Regex": "H"
102
+ },
103
+ "content": "↨h"
104
+ },
105
+ {
106
+ "type": "Replace",
107
+ "pattern": {
108
+ "Regex": "I"
109
+ },
110
+ "content": "↨i"
111
+ },
112
+ {
113
+ "type": "Replace",
114
+ "pattern": {
115
+ "Regex": "J"
116
+ },
117
+ "content": "↨j"
118
+ },
119
+ {
120
+ "type": "Replace",
121
+ "pattern": {
122
+ "Regex": "K"
123
+ },
124
+ "content": "↨k"
125
+ },
126
+ {
127
+ "type": "Replace",
128
+ "pattern": {
129
+ "Regex": "L"
130
+ },
131
+ "content": "↨l"
132
+ },
133
+ {
134
+ "type": "Replace",
135
+ "pattern": {
136
+ "Regex": "M"
137
+ },
138
+ "content": "↨m"
139
+ },
140
+ {
141
+ "type": "Replace",
142
+ "pattern": {
143
+ "Regex": "N"
144
+ },
145
+ "content": "↨n"
146
+ },
147
+ {
148
+ "type": "Replace",
149
+ "pattern": {
150
+ "Regex": "O"
151
+ },
152
+ "content": "↨o"
153
+ },
154
+ {
155
+ "type": "Replace",
156
+ "pattern": {
157
+ "Regex": "P"
158
+ },
159
+ "content": "↨p"
160
+ },
161
+ {
162
+ "type": "Replace",
163
+ "pattern": {
164
+ "Regex": "Q"
165
+ },
166
+ "content": "↨q"
167
+ },
168
+ {
169
+ "type": "Replace",
170
+ "pattern": {
171
+ "Regex": "R"
172
+ },
173
+ "content": "↨r"
174
+ },
175
+ {
176
+ "type": "Replace",
177
+ "pattern": {
178
+ "Regex": "S"
179
+ },
180
+ "content": "↨s"
181
+ },
182
+ {
183
+ "type": "Replace",
184
+ "pattern": {
185
+ "Regex": "T"
186
+ },
187
+ "content": "↨t"
188
+ },
189
+ {
190
+ "type": "Replace",
191
+ "pattern": {
192
+ "Regex": "U"
193
+ },
194
+ "content": "↨u"
195
+ },
196
+ {
197
+ "type": "Replace",
198
+ "pattern": {
199
+ "Regex": "V"
200
+ },
201
+ "content": "↨v"
202
+ },
203
+ {
204
+ "type": "Replace",
205
+ "pattern": {
206
+ "Regex": "W"
207
+ },
208
+ "content": "↨w"
209
+ },
210
+ {
211
+ "type": "Replace",
212
+ "pattern": {
213
+ "Regex": "X"
214
+ },
215
+ "content": "↨x"
216
+ },
217
+ {
218
+ "type": "Replace",
219
+ "pattern": {
220
+ "Regex": "Y"
221
+ },
222
+ "content": "↨y"
223
+ },
224
+ {
225
+ "type": "Replace",
226
+ "pattern": {
227
+ "Regex": "Z"
228
+ },
229
+ "content": "↨z"
230
+ }
231
+ ]
232
+ },
233
+ "pre_tokenizer": {
234
+ "type": "Split",
235
+ "pattern": {
236
+ "Regex": "\\X"
237
+ },
238
+ "behavior": "Isolated",
239
+ "invert": false
240
+ },
241
+ "post_processor": {
242
+ "type": "TemplateProcessing",
243
+ "single": [
244
+ {
245
+ "Sequence": {
246
+ "id": "A",
247
+ "type_id": 0
248
+ }
249
+ }
250
+ ],
251
+ "pair": [
252
+ {
253
+ "Sequence": {
254
+ "id": "A",
255
+ "type_id": 0
256
+ }
257
+ },
258
+ {
259
+ "Sequence": {
260
+ "id": "B",
261
+ "type_id": 0
262
+ }
263
+ }
264
+ ],
265
+ "special_tokens": {}
266
+ },
267
+ "decoder": {
268
+ "type": "Sequence",
269
+ "decoders": []
270
+ },
271
+ "model": {
272
+ "type": "WordLevel",
273
+ "vocab": {
274
+ "<pad>": 0,
275
+ "<unk>": 1,
276
+ "<bos>": 2,
277
+ "<eos>": 3,
278
+ "↨": 4,
279
+ "\n": 5,
280
+ "\t": 6,
281
+ " ": 7,
282
+ "0": 8,
283
+ "1": 9,
284
+ "2": 10,
285
+ "3": 11,
286
+ "4": 12,
287
+ "5": 13,
288
+ "6": 14,
289
+ "7": 15,
290
+ "8": 16,
291
+ "9": 17,
292
+ "a": 18,
293
+ "b": 19,
294
+ "c": 20,
295
+ "d": 21,
296
+ "e": 22,
297
+ "f": 23,
298
+ "g": 24,
299
+ "h": 25,
300
+ "i": 26,
301
+ "j": 27,
302
+ "k": 28,
303
+ "l": 29,
304
+ "m": 30,
305
+ "n": 31,
306
+ "o": 32,
307
+ "p": 33,
308
+ "q": 34,
309
+ "r": 35,
310
+ "s": 36,
311
+ "t": 37,
312
+ "u": 38,
313
+ "v": 39,
314
+ "w": 40,
315
+ "x": 41,
316
+ "y": 42,
317
+ "z": 43,
318
+ "\"": 44,
319
+ "!": 45,
320
+ "$": 46,
321
+ "&": 47,
322
+ "'": 48,
323
+ "#": 49,
324
+ ",": 50,
325
+ "/": 51,
326
+ "+": 52,
327
+ "=": 53,
328
+ "-": 54,
329
+ "<": 55,
330
+ ">": 56,
331
+ "*": 57,
332
+ "@": 58,
333
+ ".": 59,
334
+ ":": 60,
335
+ ";": 61,
336
+ "[": 62,
337
+ "]": 63,
338
+ "{": 64,
339
+ "}": 65,
340
+ "(": 66,
341
+ ")": 67,
342
+ "^": 68,
343
+ "_": 69,
344
+ "?": 70,
345
+ "è": 71,
346
+ "é": 72,
347
+ "¤69": 73,
348
+ "¤70": 74,
349
+ "¤71": 75,
350
+ "¤72": 76,
351
+ "¤73": 77,
352
+ "¤74": 78,
353
+ "¤75": 79,
354
+ "¤76": 80,
355
+ "¤77": 81,
356
+ "¤78": 82,
357
+ "¤79": 83,
358
+ "¤80": 84,
359
+ "¤81": 85,
360
+ "¤82": 86,
361
+ "¤83": 87,
362
+ "¤84": 88,
363
+ "¤85": 89,
364
+ "¤86": 90,
365
+ "¤87": 91,
366
+ "¤88": 92,
367
+ "¤89": 93,
368
+ "¤90": 94,
369
+ "¤91": 95,
370
+ "¤92": 96,
371
+ "¤93": 97,
372
+ "¤94": 98,
373
+ "¤95": 99,
374
+ "¤96": 100,
375
+ "¤97": 101,
376
+ "¤98": 102,
377
+ "¤99": 103,
378
+ "¤100": 104,
379
+ "¤101": 105,
380
+ "¤102": 106,
381
+ "¤103": 107,
382
+ "¤104": 108,
383
+ "¤105": 109,
384
+ "¤106": 110,
385
+ "¤107": 111,
386
+ "¤108": 112,
387
+ "¤109": 113,
388
+ "¤110": 114,
389
+ "¤111": 115,
390
+ "¤112": 116,
391
+ "¤113": 117,
392
+ "¤114": 118,
393
+ "¤115": 119,
394
+ "¤116": 120,
395
+ "¤117": 121,
396
+ "¤118": 122,
397
+ "¤119": 123,
398
+ "¤120": 124,
399
+ "¤121": 125,
400
+ "¤122": 126,
401
+ "¤123": 127
402
+ },
403
+ "unk_token": "<unk>"
404
+ }
405
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<bos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<eos>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<bos>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<eos>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "tokenizer_class": "PreTrainedTokenizer",
43
+ "unk_token": "<unk>"
44
+ }