codebyzeb commited on
Commit
162d166
·
verified ·
1 Parent(s): 67cc952

Upload folder using huggingface_hub

Browse files
fw57Mmulti_Entropy_thresholdB_32000/tokenizer.json CHANGED
@@ -40,7 +40,10 @@
40
  ]
41
  },
42
  "pre_tokenizer": {
43
- "type": "WhitespaceSplit"
 
 
 
44
  },
45
  "post_processor": {
46
  "type": "ByteLevel",
@@ -318,262 +321,262 @@
318
  "Ł": 255,
319
  "ł": 256,
320
  "Ń": 257,
321
- "##Z": 258,
322
- "##s": 259,
323
- "##L": 260,
324
- "##à": 261,
325
- "##O": 262,
326
- "##t": 263,
327
- "##ĸ": 264,
328
- "##+": 265,
329
- "##Đ": 266,
330
- "##×": 267,
331
- "##,": 268,
332
- "##Á": 269,
333
- "##ě": 270,
334
- "##ķ": 271,
335
- "##K": 272,
336
- "##n": 273,
337
- "##đ": 274,
338
- "##İ": 275,
339
- "##¢": 276,
340
- "##Õ": 277,
341
- "##í": 278,
342
- "##Ą": 279,
343
- "##Ý": 280,
344
- "##¼": 281,
345
- "##Ģ": 282,
346
- "##¶": 283,
347
- "##X": 284,
348
- "##ā": 285,
349
- "##ē": 286,
350
- "##!": 287,
351
- "##¸": 288,
352
- "##Å": 289,
353
- "##H": 290,
354
- "##¾": 291,
355
- "##Į": 292,
356
- "##j": 293,
357
- "##ß": 294,
358
- "##Y": 295,
359
- "##Î": 296,
360
- "##Ł": 297,
361
- "##Ļ": 298,
362
- "##±": 299,
363
- "##R": 300,
364
- "##i": 301,
365
- "##\"": 302,
366
- "##;": 303,
367
- "##0": 304,
368
- "##o": 305,
369
- "##Ô": 306,
370
- "##ì": 307,
371
- "##ð": 308,
372
- "##?": 309,
373
- "##Č": 310,
374
- "##.": 311,
375
- "##ĝ": 312,
376
- "##*": 313,
377
- "##î": 314,
378
- "##Ğ": 315,
379
- "##Ġ": 316,
380
- "##A": 317,
381
- "##ij": 318,
382
- "##¥": 319,
383
- "##C": 320,
384
- "##Æ": 321,
385
- "##µ": 322,
386
- "##ë": 323,
387
- "##f": 324,
388
- "##«": 325,
389
- "##Q": 326,
390
- "##Ú": 327,
391
- "##Â": 328,
392
- "##ğ": 329,
393
- "##u": 330,
394
- "##J": 331,
395
- "##_": 332,
396
- "##ċ": 333,
397
- "##å": 334,
398
- "##{": 335,
399
- "##V": 336,
400
- "##Ħ": 337,
401
- "##E": 338,
402
- "##Ĉ": 339,
403
- "##ñ": 340,
404
- "##į": 341,
405
- "##7": 342,
406
- "##5": 343,
407
- "##Ķ": 344,
408
- "##Ò": 345,
409
- "##%": 346,
410
- "##Í": 347,
411
- "##q": 348,
412
- "##č": 349,
413
- "##2": 350,
414
- "##a": 351,
415
- "##ł": 352,
416
- "##²": 353,
417
- "##£": 354,
418
- "##>": 355,
419
- "##¿": 356,
420
- "##h": 357,
421
- "##Ĭ": 358,
422
- "##Ċ": 359,
423
- "##Þ": 360,
424
- "##û": 361,
425
- "##w": 362,
426
- "##F": 363,
427
- "##|": 364,
428
- "##Ó": 365,
429
- "##Ĕ": 366,
430
  "##6": 367,
431
- "##ď": 368,
432
- "##Ę": 369,
433
- "##d": 370,
434
- "##9": 371,
435
- "##/": 372,
436
- "##Ě": 373,
437
- "##-": 374,
438
- "##p": 375,
439
- "##ĵ": 376,
440
- "##ç": 377,
441
- "##b": 378,
442
- "##ļ": 379,
443
- "##´": 380,
444
- "##Ŀ": 381,
445
- "##ĉ": 382,
446
- "##º": 383,
447
- "##c": 384,
448
- "##È": 385,
449
- "##è": 386,
450
- "##Ê": 387,
451
- "##^": 388,
452
- "##ú": 389,
453
- "##â": 390,
454
- "##ò": 391,
455
- "##Ľ": 392,
456
- "##»": 393,
457
- "##Ø": 394,
458
- "##ö": 395,
459
- "##Ã": 396,
460
- "##á": 397,
461
- "##é": 398,
462
- "##ı": 399,
463
- "##É": 400,
464
- "##Û": 401,
465
- "##Ī": 402,
466
- "##§": 403,
467
- "##ø": 404,
468
- "##Ă": 405,
469
- "##ŀ": 406,
470
- "##Ĥ": 407,
471
- "##m": 408,
472
- "##ġ": 409,
473
- "##Ď": 410,
474
- "##`": 411,
475
- "##Ć": 412,
476
- "##)": 413,
477
- "##æ": 414,
478
- "##ħ": 415,
479
- "##Ä": 416,
480
- "###": 417,
481
- "##Ð": 418,
482
- "##ê": 419,
483
- "##x": 420,
484
- "##U": 421,
485
- "##ã": 422,
486
- "##z": 423,
487
- "##ė": 424,
488
- "##e": 425,
489
- "##¹": 426,
490
- "##Ė": 427,
491
- "##ĺ": 428,
492
- "##1": 429,
493
- "##Ù": 430,
494
- "##M": 431,
495
- "##Ĵ": 432,
496
- "##Ĝ": 433,
497
- "##½": 434,
498
- "##G": 435,
499
- "##<": 436,
500
- "##Ē": 437,
501
- "##õ": 438,
502
- "##T": 439,
503
- "##l": 440,
504
- "##&": 441,
505
- "##}": 442,
506
- "##D": 443,
507
- "##ä": 444,
508
- "##ý": 445,
509
- "##:": 446,
510
- "##©": 447,
511
- "##k": 448,
512
- "##¨": 449,
513
- "##8": 450,
514
- "##ô": 451,
515
- "##ĩ": 452,
516
- "##P": 453,
517
- "##ĭ": 454,
518
- "##®": 455,
519
- "##þ": 456,
520
- "##ľ": 457,
521
- "##ć": 458,
522
- "##I": 459,
523
- "##v": 460,
524
- "##4": 461,
525
- "##ª": 462,
526
- "##r": 463,
527
- "##·": 464,
528
- "##W": 465,
529
- "##[": 466,
530
- "##B": 467,
531
- "##Ü": 468,
532
- "##~": 469,
533
- "##3": 470,
534
- "##ÿ": 471,
535
- "##@": 472,
536
- "##ă": 473,
537
- "##Ĩ": 474,
538
- "##ģ": 475,
539
- "##¯": 476,
540
- "##Ń": 477,
541
- "##¦": 478,
542
- "##Ì": 479,
543
- "##Ç": 480,
544
- "##Ë": 481,
545
- "##ī": 482,
546
- "##ù": 483,
547
- "##÷": 484,
548
- "##³": 485,
549
- "##g": 486,
550
- "##ĕ": 487,
551
- "##ą": 488,
552
- "##Ö": 489,
553
- "##IJ": 490,
554
- "##¤": 491,
555
- "##ü": 492,
556
- "##À": 493,
557
- "##\\": 494,
558
- "##=": 495,
559
- "##ï": 496,
560
- "##ę": 497,
561
- "##y": 498,
562
- "##$": 499,
563
- "##N": 500,
564
- "##Ï": 501,
565
- "##¡": 502,
566
- "##Ñ": 503,
567
- "##°": 504,
568
- "##Ā": 505,
569
- "##¬": 506,
570
- "##ĥ": 507,
571
- "##(": 508,
572
- "##S": 509,
573
- "##]": 510,
574
- "##Ĺ": 511,
575
- "##ó": 512,
576
- "##'": 513,
577
  "<|unk|>": 514,
578
  "##�": 515,
579
  "sa": 516,
 
40
  ]
41
  },
42
  "pre_tokenizer": {
43
+ "type": "ByteLevel",
44
+ "add_prefix_space": true,
45
+ "trim_offsets": true,
46
+ "use_regex": true
47
  },
48
  "post_processor": {
49
  "type": "ByteLevel",
 
321
  "Ł": 255,
322
  "ł": 256,
323
  "Ń": 257,
324
+ "##)": 258,
325
+ "##ĝ": 259,
326
+ "##¶": 260,
327
+ "##|": 261,
328
+ "##Ē": 262,
329
+ "##Ě": 263,
330
+ "##`": 264,
331
+ "##ç": 265,
332
+ "##÷": 266,
333
+ "##p": 267,
334
+ "##=": 268,
335
+ "##2": 269,
336
+ "##U": 270,
337
+ "##s": 271,
338
+ "##¢": 272,
339
+ "##Í": 273,
340
+ "##Ê": 274,
341
+ "##ŀ": 275,
342
+ "##ú": 276,
343
+ "##û": 277,
344
+ "##ij": 278,
345
+ "##3": 279,
346
+ "##k": 280,
347
+ "##1": 281,
348
+ "##o": 282,
349
+ "##T": 283,
350
+ "##é": 284,
351
+ "##[": 285,
352
+ "##ċ": 286,
353
+ "##-": 287,
354
+ "##ħ": 288,
355
+ "##,": 289,
356
+ "##Ë": 290,
357
+ "##ĺ": 291,
358
+ "##I": 292,
359
+ "##đ": 293,
360
+ "##Ü": 294,
361
+ "##Ļ": 295,
362
+ "##ß": 296,
363
+ "##;": 297,
364
+ "##Ñ": 298,
365
+ "##R": 299,
366
+ "##@": 300,
367
+ "##\\": 301,
368
+ "##r": 302,
369
+ "##]": 303,
370
+ "##»": 304,
371
+ "##Ð": 305,
372
+ "##M": 306,
373
+ "##â": 307,
374
+ "##Ô": 308,
375
+ "##Õ": 309,
376
+ "##Đ": 310,
377
+ "##ę": 311,
378
+ "##ì": 312,
379
+ "##ð": 313,
380
+ "##¸": 314,
381
+ "##ı": 315,
382
+ "##_": 316,
383
+ "##4": 317,
384
+ "##ļ": 318,
385
+ "##Ğ": 319,
386
+ "##<": 320,
387
+ "##n": 321,
388
+ "##É": 322,
389
+ "##'": 323,
390
+ "##ò": 324,
391
+ "##0": 325,
392
+ "##C": 326,
393
+ "##ď": 327,
394
+ "##a": 328,
395
+ "##Ą": 329,
396
+ "##Ĭ": 330,
397
+ "##Æ": 331,
398
+ "##Ā": 332,
399
+ "##ģ": 333,
400
+ "##q": 334,
401
+ "##N": 335,
402
+ "##·": 336,
403
+ "##¬": 337,
404
+ "##Ý": 338,
405
+ "##½": 339,
406
+ "##Ď": 340,
407
+ "##ė": 341,
408
+ "##į": 342,
409
+ "##¤": 343,
410
+ "##Ĵ": 344,
411
+ "##Q": 345,
412
+ "##Ò": 346,
413
+ "##Ķ": 347,
414
+ "##L": 348,
415
+ "##ĭ": 349,
416
+ "##Ĉ": 350,
417
+ "##}": 351,
418
+ "##°": 352,
419
+ "##~": 353,
420
+ "##ĉ": 354,
421
+ "##^": 355,
422
+ "##!": 356,
423
+ "##ĕ": 357,
424
+ "##e": 358,
425
+ "##ä": 359,
426
+ "##þ": 360,
427
+ "##ö": 361,
428
+ "##Ć": 362,
429
+ "##Ċ": 363,
430
+ "##X": 364,
431
+ "##«": 365,
432
+ "##ľ": 366,
433
  "##6": 367,
434
+ "##l": 368,
435
+ "##à": 369,
436
+ "##Ń": 370,
437
+ "##Ú": 371,
438
+ "##5": 372,
439
+ "##Č": 373,
440
+ "##Ę": 374,
441
+ "##¡": 375,
442
+ "##W": 376,
443
+ "##ī": 377,
444
+ "##+": 378,
445
+ "##>": 379,
446
+ "##Ĝ": 380,
447
+ "##ł": 381,
448
+ "##®": 382,
449
+ "##%": 383,
450
+ "##:": 384,
451
+ "##&": 385,
452
+ "##Ç": 386,
453
+ "##ĸ": 387,
454
+ "##/": 388,
455
+ "##Ã": 389,
456
+ "##á": 390,
457
+ "##c": 391,
458
+ "##ē": 392,
459
+ "##Ï": 393,
460
+ "##¹": 394,
461
+ "##O": 395,
462
+ "##Ó": 396,
463
+ "##ñ": 397,
464
+ "##Ī": 398,
465
+ "##d": 399,
466
+ "##ù": 400,
467
+ "##Ă": 401,
468
+ "##æ": 402,
469
+ "##¾": 403,
470
+ "##ó": 404,
471
+ "##õ": 405,
472
+ "##ü": 406,
473
+ "##D": 407,
474
+ "##¯": 408,
475
+ "##Ġ": 409,
476
+ "##G": 410,
477
+ "##v": 411,
478
+ "##£": 412,
479
+ "##è": 413,
480
+ "##Ø": 414,
481
+ "##ý": 415,
482
+ "##ĩ": 416,
483
+ "##*": 417,
484
+ "##{": 418,
485
+ "##Â": 419,
486
+ "##h": 420,
487
+ "##w": 421,
488
+ "##¿": 422,
489
+ "##B": 423,
490
+ "##Á": 424,
491
+ "##t": 425,
492
+ "##ô": 426,
493
+ "###": 427,
494
+ "##?": 428,
495
+ "##¨": 429,
496
+ "##Ä": 430,
497
+ "##8": 431,
498
+ "##P": 432,
499
+ "##Û": 433,
500
+ "##ÿ": 434,
501
+ "##ā": 435,
502
+ "##Y": 436,
503
+ "##©": 437,
504
+ "##ě": 438,
505
+ "##Ì": 439,
506
+ "##ĥ": 440,
507
+ "##Ĥ": 441,
508
+ "##º": 442,
509
+ "##y": 443,
510
+ "##²": 444,
511
+ "##µ": 445,
512
+ "##E": 446,
513
+ "##¦": 447,
514
+ "##´": 448,
515
+ "##F": 449,
516
+ "##Ĕ": 450,
517
+ "##İ": 451,
518
+ "##Ľ": 452,
519
+ "##î": 453,
520
+ "##g": 454,
521
+ "##ª": 455,
522
+ "##¼": 456,
523
+ "##7": 457,
524
+ "##×": 458,
525
+ "##Å": 459,
526
+ "##b": 460,
527
+ "##§": 461,
528
+ "##ê": 462,
529
+ "##f": 463,
530
+ "##Ö": 464,
531
+ "##å": 465,
532
+ "##Z": 466,
533
+ "##ă": 467,
534
+ "##H": 468,
535
+ "##ġ": 469,
536
+ "##Ĩ": 470,
537
+ "##x": 471,
538
+ "##í": 472,
539
+ "##³": 473,
540
+ "##9": 474,
541
+ "##Þ": 475,
542
+ "##Ħ": 476,
543
+ "##J": 477,
544
+ "##Ł": 478,
545
+ "##IJ": 479,
546
+ "##ø": 480,
547
+ "##z": 481,
548
+ "##¥": 482,
549
+ "##j": 483,
550
+ "##È": 484,
551
+ "##č": 485,
552
+ "##ķ": 486,
553
+ "##V": 487,
554
+ "##Ù": 488,
555
+ "##S": 489,
556
+ "##$": 490,
557
+ "##ć": 491,
558
+ "##m": 492,
559
+ "##(": 493,
560
+ "##ï": 494,
561
+ "##Į": 495,
562
+ "##.": 496,
563
+ "##Ģ": 497,
564
+ "##i": 498,
565
+ "##Ŀ": 499,
566
+ "##Ĺ": 500,
567
+ "##A": 501,
568
+ "##±": 502,
569
+ "##ğ": 503,
570
+ "##Ė": 504,
571
+ "##u": 505,
572
+ "##Î": 506,
573
+ "##À": 507,
574
+ "##ë": 508,
575
+ "##ã": 509,
576
+ "##K": 510,
577
+ "##\"": 511,
578
+ "##ą": 512,
579
+ "##ĵ": 513,
580
  "<|unk|>": 514,
581
  "##�": 515,
582
  "sa": 516,
fw57Mmulti_Entropy_thresholdB_32000/vocab.json CHANGED
The diff for this file is too large to render. See raw diff