Zeb committed on
Commit
31e10af
·
1 Parent(s): 3f7ba71

Rename bytelevel

Browse files
bytelevel/tokenizer.json CHANGED
@@ -22,14 +22,7 @@
22
  "special": true
23
  }
24
  ],
25
- "normalizer": {
26
- "type": "Sequence",
27
- "normalizers": [
28
- {
29
- "type": "NFD"
30
- }
31
- ]
32
- },
33
  "pre_tokenizer": {
34
  "type": "ByteLevel",
35
  "add_prefix_space": true,
 
22
  "special": true
23
  }
24
  ],
25
+ "normalizer": null,
 
 
 
 
 
 
 
26
  "pre_tokenizer": {
27
  "type": "ByteLevel",
28
  "add_prefix_space": true,
bytelevel2/special_tokens_map.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "eos_token": "<|endoftext|>",
3
- "pad_token": "<|padding|>"
4
- }
 
 
 
 
 
bytelevel2/tokenizer.json DELETED
@@ -1,315 +0,0 @@
1
- {
2
- "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
5
- "added_tokens": [
6
- {
7
- "id": 0,
8
- "content": "<|padding|>",
9
- "single_word": false,
10
- "lstrip": false,
11
- "rstrip": false,
12
- "normalized": false,
13
- "special": true
14
- },
15
- {
16
- "id": 1,
17
- "content": "<|endoftext|>",
18
- "single_word": false,
19
- "lstrip": false,
20
- "rstrip": false,
21
- "normalized": false,
22
- "special": true
23
- }
24
- ],
25
- "normalizer": null,
26
- "pre_tokenizer": {
27
- "type": "ByteLevel",
28
- "add_prefix_space": true,
29
- "trim_offsets": true,
30
- "use_regex": true
31
- },
32
- "post_processor": {
33
- "type": "ByteLevel",
34
- "add_prefix_space": true,
35
- "trim_offsets": true,
36
- "use_regex": true
37
- },
38
- "decoder": {
39
- "type": "ByteLevel",
40
- "add_prefix_space": true,
41
- "trim_offsets": true,
42
- "use_regex": true
43
- },
44
- "model": {
45
- "type": "BPE",
46
- "dropout": null,
47
- "unk_token": null,
48
- "continuing_subword_prefix": null,
49
- "end_of_word_suffix": null,
50
- "fuse_unk": false,
51
- "byte_fallback": false,
52
- "ignore_merges": false,
53
- "vocab": {
54
- "<|padding|>": 0,
55
- "<|endoftext|>": 1,
56
- "!": 2,
57
- "\"": 3,
58
- "#": 4,
59
- "$": 5,
60
- "%": 6,
61
- "&": 7,
62
- "'": 8,
63
- "(": 9,
64
- ")": 10,
65
- "*": 11,
66
- "+": 12,
67
- ",": 13,
68
- "-": 14,
69
- ".": 15,
70
- "/": 16,
71
- "0": 17,
72
- "1": 18,
73
- "2": 19,
74
- "3": 20,
75
- "4": 21,
76
- "5": 22,
77
- "6": 23,
78
- "7": 24,
79
- "8": 25,
80
- "9": 26,
81
- ":": 27,
82
- ";": 28,
83
- "<": 29,
84
- "=": 30,
85
- ">": 31,
86
- "?": 32,
87
- "@": 33,
88
- "A": 34,
89
- "B": 35,
90
- "C": 36,
91
- "D": 37,
92
- "E": 38,
93
- "F": 39,
94
- "G": 40,
95
- "H": 41,
96
- "I": 42,
97
- "J": 43,
98
- "K": 44,
99
- "L": 45,
100
- "M": 46,
101
- "N": 47,
102
- "O": 48,
103
- "P": 49,
104
- "Q": 50,
105
- "R": 51,
106
- "S": 52,
107
- "T": 53,
108
- "U": 54,
109
- "V": 55,
110
- "W": 56,
111
- "X": 57,
112
- "Y": 58,
113
- "Z": 59,
114
- "[": 60,
115
- "\\": 61,
116
- "]": 62,
117
- "^": 63,
118
- "_": 64,
119
- "`": 65,
120
- "a": 66,
121
- "b": 67,
122
- "c": 68,
123
- "d": 69,
124
- "e": 70,
125
- "f": 71,
126
- "g": 72,
127
- "h": 73,
128
- "i": 74,
129
- "j": 75,
130
- "k": 76,
131
- "l": 77,
132
- "m": 78,
133
- "n": 79,
134
- "o": 80,
135
- "p": 81,
136
- "q": 82,
137
- "r": 83,
138
- "s": 84,
139
- "t": 85,
140
- "u": 86,
141
- "v": 87,
142
- "w": 88,
143
- "x": 89,
144
- "y": 90,
145
- "z": 91,
146
- "{": 92,
147
- "|": 93,
148
- "}": 94,
149
- "~": 95,
150
- "¡": 96,
151
- "¢": 97,
152
- "£": 98,
153
- "¤": 99,
154
- "¥": 100,
155
- "¦": 101,
156
- "§": 102,
157
- "¨": 103,
158
- "©": 104,
159
- "ª": 105,
160
- "«": 106,
161
- "¬": 107,
162
- "®": 108,
163
- "¯": 109,
164
- "°": 110,
165
- "±": 111,
166
- "²": 112,
167
- "³": 113,
168
- "´": 114,
169
- "µ": 115,
170
- "¶": 116,
171
- "·": 117,
172
- "¸": 118,
173
- "¹": 119,
174
- "º": 120,
175
- "»": 121,
176
- "¼": 122,
177
- "½": 123,
178
- "¾": 124,
179
- "¿": 125,
180
- "À": 126,
181
- "Á": 127,
182
- "Â": 128,
183
- "Ã": 129,
184
- "Ä": 130,
185
- "Å": 131,
186
- "Æ": 132,
187
- "Ç": 133,
188
- "È": 134,
189
- "É": 135,
190
- "Ê": 136,
191
- "Ë": 137,
192
- "Ì": 138,
193
- "Í": 139,
194
- "Î": 140,
195
- "Ï": 141,
196
- "Ð": 142,
197
- "Ñ": 143,
198
- "Ò": 144,
199
- "Ó": 145,
200
- "Ô": 146,
201
- "Õ": 147,
202
- "Ö": 148,
203
- "×": 149,
204
- "Ø": 150,
205
- "Ù": 151,
206
- "Ú": 152,
207
- "Û": 153,
208
- "Ü": 154,
209
- "Ý": 155,
210
- "Þ": 156,
211
- "ß": 157,
212
- "à": 158,
213
- "á": 159,
214
- "â": 160,
215
- "ã": 161,
216
- "ä": 162,
217
- "å": 163,
218
- "æ": 164,
219
- "ç": 165,
220
- "è": 166,
221
- "é": 167,
222
- "ê": 168,
223
- "ë": 169,
224
- "ì": 170,
225
- "í": 171,
226
- "î": 172,
227
- "ï": 173,
228
- "ð": 174,
229
- "ñ": 175,
230
- "ò": 176,
231
- "ó": 177,
232
- "ô": 178,
233
- "õ": 179,
234
- "ö": 180,
235
- "÷": 181,
236
- "ø": 182,
237
- "ù": 183,
238
- "ú": 184,
239
- "û": 185,
240
- "ü": 186,
241
- "ý": 187,
242
- "þ": 188,
243
- "ÿ": 189,
244
- "Ā": 190,
245
- "ā": 191,
246
- "Ă": 192,
247
- "ă": 193,
248
- "Ą": 194,
249
- "ą": 195,
250
- "Ć": 196,
251
- "ć": 197,
252
- "Ĉ": 198,
253
- "ĉ": 199,
254
- "Ċ": 200,
255
- "ċ": 201,
256
- "Č": 202,
257
- "č": 203,
258
- "Ď": 204,
259
- "ď": 205,
260
- "Đ": 206,
261
- "đ": 207,
262
- "Ē": 208,
263
- "ē": 209,
264
- "Ĕ": 210,
265
- "ĕ": 211,
266
- "Ė": 212,
267
- "ė": 213,
268
- "Ę": 214,
269
- "ę": 215,
270
- "Ě": 216,
271
- "ě": 217,
272
- "Ĝ": 218,
273
- "ĝ": 219,
274
- "Ğ": 220,
275
- "ğ": 221,
276
- "Ġ": 222,
277
- "ġ": 223,
278
- "Ģ": 224,
279
- "ģ": 225,
280
- "Ĥ": 226,
281
- "ĥ": 227,
282
- "Ħ": 228,
283
- "ħ": 229,
284
- "Ĩ": 230,
285
- "ĩ": 231,
286
- "Ī": 232,
287
- "ī": 233,
288
- "Ĭ": 234,
289
- "ĭ": 235,
290
- "Į": 236,
291
- "į": 237,
292
- "İ": 238,
293
- "ı": 239,
294
- "Ĳ": 240,
295
- "ĳ": 241,
296
- "Ĵ": 242,
297
- "ĵ": 243,
298
- "Ķ": 244,
299
- "ķ": 245,
300
- "ĸ": 246,
301
- "Ĺ": 247,
302
- "ĺ": 248,
303
- "Ļ": 249,
304
- "ļ": 250,
305
- "Ľ": 251,
306
- "ľ": 252,
307
- "Ŀ": 253,
308
- "ŀ": 254,
309
- "Ł": 255,
310
- "ł": 256,
311
- "Ń": 257
312
- },
313
- "merges": []
314
- }
315
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bytelevel2/tokenizer_config.json DELETED
@@ -1,29 +0,0 @@
1
- {
2
- "add_prefix_space": true,
3
- "added_tokens_decoder": {
4
- "0": {
5
- "content": "<|padding|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "1": {
13
- "content": "<|endoftext|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- }
20
- },
21
- "bos_token": null,
22
- "clean_up_tokenization_spaces": false,
23
- "eos_token": "<|endoftext|>",
24
- "extra_special_tokens": {},
25
- "model_max_length": 1000000000000000019884624838656,
26
- "pad_token": "<|padding|>",
27
- "tokenizer_class": "PreTrainedTokenizer",
28
- "unk_token": null
29
- }