codebyzeb committed on
Commit
47b2f9e
·
verified ·
1 Parent(s): 0e739a7

Upload folder using huggingface_hub

Browse files
fw57M_Surprisal_thresholdB_8064/tokenizer.json CHANGED
@@ -40,7 +40,10 @@
40
  ]
41
  },
42
  "pre_tokenizer": {
43
- "type": "WhitespaceSplit"
 
 
 
44
  },
45
  "post_processor": {
46
  "type": "ByteLevel",
@@ -318,262 +321,262 @@
318
  "Ł": 255,
319
  "ł": 256,
320
  "Ń": 257,
321
- "##ĉ": 258,
322
  "##z": 259,
323
- "##Ó": 260,
324
- "##,": 261,
325
- "##Ý": 262,
326
- "##M": 263,
327
- "##i": 264,
328
- "##a": 265,
329
- "##s": 266,
330
- "##Y": 267,
331
- "##|": 268,
332
- "##R": 269,
333
- "##d": 270,
334
- "##j": 271,
335
- "##°": 272,
336
- "##³": 273,
337
- "##Î": 274,
338
- "##ď": 275,
339
- "##;": 276,
340
- "##<": 277,
341
- "##ā": 278,
342
- "##'": 279,
343
- "##Ĺ": 280,
344
- "##Ç": 281,
345
- "##%": 282,
346
- "##Ď": 283,
347
- "##]": 284,
348
- "##å": 285,
349
- "##ĸ": 286,
350
- "##ĵ": 287,
351
- "##w": 288,
352
- "##¿": 289,
353
- "##È": 290,
354
- "##Ē": 291,
355
- "##E": 292,
356
- "##I": 293,
357
- "##ğ": 294,
358
- "##ê": 295,
359
- "##Ľ": 296,
360
- "##$": 297,
361
- "##ĕ": 298,
362
- "##ě": 299,
363
- "##P": 300,
364
- "##Þ": 301,
365
- "##J": 302,
366
- "##Č": 303,
367
- "##¸": 304,
368
- "##m": 305,
369
- "##č": 306,
370
- "##¦": 307,
371
- "##Ø": 308,
372
- "##Á": 309,
373
- "##`": 310,
374
- "##À": 311,
375
- "##@": 312,
376
- "##/": 313,
377
- "##4": 314,
378
- "##Û": 315,
379
- "##Ñ": 316,
380
- "##Ĥ": 317,
381
- "##Â": 318,
382
- "##Q": 319,
383
- "##Ğ": 320,
384
- "##~": 321,
385
- "##×": 322,
386
- "##©": 323,
387
- "##è": 324,
388
- "##ù": 325,
389
- "##q": 326,
390
- "##c": 327,
391
- "##h": 328,
392
- "##ú": 329,
393
- "##Ġ": 330,
394
- "##į": 331,
395
- "##Ð": 332,
396
- "##ã": 333,
397
- "##.": 334,
398
- "##[": 335,
399
- "##k": 336,
400
- "##}": 337,
401
- "##Ė": 338,
402
- "##Į": 339,
403
- "##G": 340,
404
- "##Ĭ": 341,
405
- "##3": 342,
406
- "##+": 343,
407
- "##ķ": 344,
408
- "##Ł": 345,
409
- "##£": 346,
410
- "##Ĝ": 347,
411
- "##Ą": 348,
412
- "##ė": 349,
413
- "##K": 350,
414
- "##_": 351,
415
- "##õ": 352,
416
- "##N": 353,
417
- "##ï": 354,
418
- "##7": 355,
419
- "##ä": 356,
420
- "##²": 357,
421
- "##ă": 358,
422
- "##í": 359,
423
- "##ü": 360,
424
- "##ľ": 361,
425
- "##*": 362,
426
- "##\"": 363,
427
- "##f": 364,
428
- "##ë": 365,
429
- "##ī": 366,
430
- "##IJ": 367,
431
- "##¡": 368,
432
- "##Ê": 369,
433
- "##þ": 370,
434
- "##8": 371,
435
- "##>": 372,
436
- "##Ċ": 373,
437
- "##Ô": 374,
438
- "##ø": 375,
439
- "##n": 376,
440
- "##İ": 377,
441
- "##Ă": 378,
442
- "##¯": 379,
443
- "##ć": 380,
444
- "##·": 381,
445
- "##Ģ": 382,
446
- "##Ĉ": 383,
447
- "##ĝ": 384,
448
- "##ħ": 385,
449
- "##b": 386,
450
- "##É": 387,
451
- "##H": 388,
452
- "##9": 389,
453
- "##û": 390,
454
- "##l": 391,
455
- "##-": 392,
456
- "##r": 393,
457
- "##ó": 394,
458
- "##ę": 395,
459
- "##D": 396,
460
- "##ª": 397,
461
- "##ð": 398,
462
- "##¬": 399,
463
- "##v": 400,
464
- "##5": 401,
465
- "##Ì": 402,
466
- "##Z": 403,
467
- "##æ": 404,
468
- "##Ù": 405,
469
- "##ô": 406,
470
- "##ł": 407,
471
- "##e": 408,
472
- "##S": 409,
473
- "##Ļ": 410,
474
- "##L": 411,
475
- "##à": 412,
476
- "##Ò": 413,
477
- "##ġ": 414,
478
- "##:": 415,
479
- "##Ï": 416,
480
- "##ą": 417,
481
- "##Ë": 418,
482
- "##Ĵ": 419,
483
- "##g": 420,
484
- "##Ć": 421,
485
- "##ı": 422,
486
- "##6": 423,
487
- "##B": 424,
488
- "##p": 425,
489
- "###": 426,
490
- "##¨": 427,
491
- "##Ĩ": 428,
492
- "##O": 429,
493
- "##¹": 430,
494
- "##¼": 431,
495
- "##Ě": 432,
496
- "##y": 433,
497
- "##Ã": 434,
498
- "##º": 435,
499
- "##Ü": 436,
500
- "##«": 437,
501
- "##ĭ": 438,
502
- "##ö": 439,
503
- "##=": 440,
504
- "##A": 441,
505
- "##§": 442,
506
- "##¾": 443,
507
- "##ļ": 444,
508
- "##±": 445,
509
- "##Ę": 446,
510
- "##Ķ": 447,
511
- "##Ŀ": 448,
512
- "##ß": 449,
513
- "##X": 450,
514
- "##ñ": 451,
515
- "##ò": 452,
516
- "##t": 453,
517
- "##Ī": 454,
518
- "##÷": 455,
519
- "##ĺ": 456,
520
- "##Í": 457,
521
- "##T": 458,
522
- "##Ö": 459,
523
- "##o": 460,
524
- "##1": 461,
525
- "##½": 462,
526
- "##)": 463,
527
- "##»": 464,
528
- "##á": 465,
529
- "##¶": 466,
530
- "##ý": 467,
531
- "##ĩ": 468,
532
- "##U": 469,
533
- "##Ä": 470,
534
- "##W": 471,
535
- "##2": 472,
536
- "##Ħ": 473,
537
- "##¥": 474,
538
- "##!": 475,
539
- "##ċ": 476,
540
- "##ij": 477,
541
- "##\\": 478,
542
- "##µ": 479,
543
- "##ì": 480,
544
- "##&": 481,
545
- "##ÿ": 482,
546
- "##ĥ": 483,
547
- "##F": 484,
548
- "##C": 485,
549
- "##Æ": 486,
550
- "##đ": 487,
551
- "##®": 488,
552
- "##Å": 489,
553
- "##V": 490,
554
- "##Ú": 491,
555
- "##â": 492,
556
- "##?": 493,
557
- "##(": 494,
558
- "##0": 495,
559
- "##x": 496,
560
- "##ģ": 497,
561
- "##^": 498,
562
- "##î": 499,
563
- "##Đ": 500,
564
- "##Ĕ": 501,
565
- "##Ń": 502,
566
- "##ŀ": 503,
567
- "##ç": 504,
568
- "##é": 505,
569
- "##u": 506,
570
- "##ē": 507,
571
- "##´": 508,
572
- "##Ā": 509,
573
- "##¤": 510,
574
- "##Õ": 511,
575
- "##{": 512,
576
- "##¢": 513,
577
  "<|unk|>": 514,
578
  "##ng": 515,
579
  "##�": 516,
 
40
  ]
41
  },
42
  "pre_tokenizer": {
43
+ "type": "ByteLevel",
44
+ "add_prefix_space": true,
45
+ "trim_offsets": true,
46
+ "use_regex": true
47
  },
48
  "post_processor": {
49
  "type": "ByteLevel",
 
321
  "Ł": 255,
322
  "ł": 256,
323
  "Ń": 257,
324
+ "##>": 258,
325
  "##z": 259,
326
+ "##Ï": 260,
327
+ "##ú": 261,
328
+ "##M": 262,
329
+ "##^": 263,
330
+ "##¢": 264,
331
+ "##T": 265,
332
+ "##.": 266,
333
+ "##6": 267,
334
+ "##:": 268,
335
+ "##Z": 269,
336
+ "##¯": 270,
337
+ "##È": 271,
338
+ "##q": 272,
339
+ "##ç": 273,
340
+ "##ò": 274,
341
+ "##9": 275,
342
+ "##ì": 276,
343
+ "##ě": 277,
344
+ "##k": 278,
345
+ "##u": 279,
346
+ "##$": 280,
347
+ "##ĥ": 281,
348
+ "##ē": 282,
349
+ "##-": 283,
350
+ "##Å": 284,
351
+ "##G": 285,
352
+ "##b": 286,
353
+ "##Ð": 287,
354
+ "##ğ": 288,
355
+ "##Â": 289,
356
+ "##č": 290,
357
+ "##÷": 291,
358
+ "##`": 292,
359
+ "##ĩ": 293,
360
+ "##Ù": 294,
361
+ "##ď": 295,
362
+ "##ä": 296,
363
+ "##¸": 297,
364
+ "##ê": 298,
365
+ "##¾": 299,
366
+ "##X": 300,
367
+ "##Ğ": 301,
368
+ "##+": 302,
369
+ "##Ü": 303,
370
+ "##ą": 304,
371
+ "##g": 305,
372
+ "##À": 306,
373
+ "##Ĉ": 307,
374
+ "##&": 308,
375
+ "##h": 309,
376
+ "##ñ": 310,
377
+ "##İ": 311,
378
+ "##%": 312,
379
+ "##ã": 313,
380
+ "##Í": 314,
381
+ "##Ď": 315,
382
+ "##H": 316,
383
+ "##ª": 317,
384
+ "##Ø": 318,
385
+ "##7": 319,
386
+ "##®": 320,
387
+ "##´": 321,
388
+ "##õ": 322,
389
+ "##¦": 323,
390
+ "##ı": 324,
391
+ "##¬": 325,
392
+ "##E": 326,
393
+ "##Ñ": 327,
394
+ "##3": 328,
395
+ "##é": 329,
396
+ "##ï": 330,
397
+ "##Ć": 331,
398
+ "##ħ": 332,
399
+ "##_": 333,
400
+ "##×": 334,
401
+ "##á": 335,
402
+ "##Ę": 336,
403
+ "##ļ": 337,
404
+ "##¿": 338,
405
+ "##¶": 339,
406
+ "##ă": 340,
407
+ "##ð": 341,
408
+ "##±": 342,
409
+ "##Ľ": 343,
410
+ "##R": 344,
411
+ "##Ö": 345,
412
+ "##Ĵ": 346,
413
+ "##ü": 347,
414
+ "##U": 348,
415
+ "##]": 349,
416
+ "##8": 350,
417
+ "##í": 351,
418
+ "##I": 352,
419
+ "##ø": 353,
420
+ "##F": 354,
421
+ "##æ": 355,
422
+ "##Ā": 356,
423
+ "##ĭ": 357,
424
+ "##m": 358,
425
+ "##è": 359,
426
+ "###": 360,
427
+ "##L": 361,
428
+ "##S": 362,
429
+ "##[": 363,
430
+ "##ë": 364,
431
+ "##i": 365,
432
+ "##IJ": 366,
433
+ "##c": 367,
434
+ "##(": 368,
435
+ "##Ġ": 369,
436
+ "##5": 370,
437
+ "##ċ": 371,
438
+ "##ó": 372,
439
+ "##s": 373,
440
+ "##å": 374,
441
+ "##»": 375,
442
+ "##~": 376,
443
+ "##Ń": 377,
444
+ "##į": 378,
445
+ "##C": 379,
446
+ "##p": 380,
447
+ "##*": 381,
448
+ "##@": 382,
449
+ "##ę": 383,
450
+ "##ė": 384,
451
+ "##Ĝ": 385,
452
+ "##à": 386,
453
+ "##V": 387,
454
+ "##Č": 388,
455
+ "##ö": 389,
456
+ "##?": 390,
457
+ "##Ħ": 391,
458
+ "##ī": 392,
459
+ "##Ĭ": 393,
460
+ "##Ē": 394,
461
+ "##e": 395,
462
+ "##Ú": 396,
463
+ "##Ò": 397,
464
+ "##ô": 398,
465
+ "##£": 399,
466
+ "##)": 400,
467
+ "##ł": 401,
468
+ "##Ĺ": 402,
469
+ "##{": 403,
470
+ "##°": 404,
471
+ "##x": 405,
472
+ "##N": 406,
473
+ "##Ä": 407,
474
+ "##ģ": 408,
475
+ "##!": 409,
476
+ "##µ": 410,
477
+ "##Ă": 411,
478
+ "##·": 412,
479
+ "##ÿ": 413,
480
+ "##v": 414,
481
+ "##J": 415,
482
+ "##a": 416,
483
+ "##³": 417,
484
+ "##Õ": 418,
485
+ "##0": 419,
486
+ "##û": 420,
487
+ "##4": 421,
488
+ "##Ī": 422,
489
+ "##Ķ": 423,
490
+ "##ć": 424,
491
+ "##w": 425,
492
+ "##©": 426,
493
+ "##ĸ": 427,
494
+ "##Ĕ": 428,
495
+ "##2": 429,
496
+ "##ý": 430,
497
+ "##É": 431,
498
+ "##n": 432,
499
+ "##Ĩ": 433,
500
+ "##ij": 434,
501
+ "##/": 435,
502
+ "##l": 436,
503
+ "##Û": 437,
504
+ "##Ã": 438,
505
+ "##=": 439,
506
+ "##ľ": 440,
507
+ "##r": 441,
508
+ "##K": 442,
509
+ "##¤": 443,
510
+ "##Đ": 444,
511
+ "##²": 445,
512
+ "##ġ": 446,
513
+ "##Y": 447,
514
+ "##ù": 448,
515
+ "##Ł": 449,
516
+ "##,": 450,
517
+ "##Ô": 451,
518
+ "##þ": 452,
519
+ "##º": 453,
520
+ "##P": 454,
521
+ "##j": 455,
522
+ "##§": 456,
523
+ "##¹": 457,
524
+ "##Ë": 458,
525
+ "##Á": 459,
526
+ "##â": 460,
527
+ "##'": 461,
528
+ "##¥": 462,
529
+ "##A": 463,
530
+ "##Ê": 464,
531
+ "##ā": 465,
532
+ "##ß": 466,
533
+ "##Ė": 467,
534
+ "##Į": 468,
535
+ "##Ģ": 469,
536
+ "##B": 470,
537
+ "##Ċ": 471,
538
+ "##}": 472,
539
+ "##î": 473,
540
+ "##Ě": 474,
541
+ "##f": 475,
542
+ "##Ç": 476,
543
+ "##<": 477,
544
+ "##½": 478,
545
+ "##¡": 479,
546
+ "##W": 480,
547
+ "##t": 481,
548
+ "##đ": 482,
549
+ "##Ŀ": 483,
550
+ "##1": 484,
551
+ "##d": 485,
552
+ "##Ì": 486,
553
+ "##Ý": 487,
554
+ "##Ą": 488,
555
+ "##o": 489,
556
+ "##y": 490,
557
+ "##\"": 491,
558
+ "##¨": 492,
559
+ "##«": 493,
560
+ "##ĝ": 494,
561
+ "##ķ": 495,
562
+ "##;": 496,
563
+ "##Q": 497,
564
+ "##O": 498,
565
+ "##ĺ": 499,
566
+ "##\\": 500,
567
+ "##Ó": 501,
568
+ "##ĵ": 502,
569
+ "##|": 503,
570
+ "##Î": 504,
571
+ "##Æ": 505,
572
+ "##¼": 506,
573
+ "##Ĥ": 507,
574
+ "##Þ": 508,
575
+ "##Ļ": 509,
576
+ "##ĕ": 510,
577
+ "##D": 511,
578
+ "##ĉ": 512,
579
+ "##ŀ": 513,
580
  "<|unk|>": 514,
581
  "##ng": 515,
582
  "##�": 516,
fw57M_Surprisal_thresholdB_8064/vocab.json CHANGED
The diff for this file is too large to render. See raw diff