Zeb committed
Commit 6b0beee · 1 Parent(s): 2e4b4eb

Fix tokenizers
.gitattributes CHANGED
@@ -54,3 +54,4 @@ mutual-information_256000/special_tokens_map.json filter=lfs diff=lfs merge=lfs
 mutual-information_256000/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
 mutual-information_256000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 fw57M_Entropy_threshold_128000/stats.csv filter=lfs diff=lfs merge=lfs -text
+frequencymulti_256000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
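
The new .gitattributes line routes frequencymulti_256000/tokenizer.json through git-lfs, so a plain clone holds only a small pointer file until the object is pulled. A minimal sketch for detecting whether a checked-out file is still an un-smudged pointer (the LFS spec requires the pointer's first line to be the version URL):

```python
# Minimal sketch: detect whether a checked-out file is still an
# un-smudged git-lfs pointer rather than the real payload.
def is_lfs_pointer(path):
    with open(path, "rb") as f:
        head = f.read(64)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("frequencymulti_256000/tokenizer.json"))
```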
frequency_256000/merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
frequency_256000/merges_data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
frequency_256000/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed37c183a6fdf658a259f23b64ca57aab65a1b96dbcb8343b06e108fe6fbec55
-size 20274248
+oid sha256:47a19d76ccd8400f78de8cb1bb0cc81e948596a5a9337c90d739c81a7c59ce8b
+size 19623518
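
The LFS pointer for frequency_256000/tokenizer.json now references a new, slightly smaller object. A minimal sketch for checking that a pulled file matches the pointer, assuming `git lfs pull` has already replaced the pointer with the real contents (the path is relative to a local checkout):

```python
import hashlib

# Hypothetical local checkout path; run `git lfs pull` first so the
# pointer has been replaced by the real file contents.
path = "frequency_256000/tokenizer.json"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

# Expected to match the new pointer:
#   oid  sha256:47a19d76ccd8400f78de8cb1bb0cc81e948596a5a9337c90d739c81a7c59ce8b
#   size 19623518
print(h.hexdigest())
```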
frequency_256000/vocab.json CHANGED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_8064/stats.csv → frequencymulti_256000/merges.txt RENAMED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_32000/stats.csv → frequencymulti_256000/merges_data.csv RENAMED
The diff for this file is too large to render. See raw diff
 
frequencymulti_256000/special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
+{
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|padding|>"
+}
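
The new special_tokens_map.json declares only an EOS and a padding token. A minimal sketch of loading the added tokenizer and inspecting them, assuming a local checkout with the LFS objects pulled (the path is illustrative):

```python
from transformers import AutoTokenizer

# Illustrative path to the folder added in this commit; substitute the
# actual repo id or local checkout location.
tok = AutoTokenizer.from_pretrained("frequencymulti_256000")

print(tok.eos_token)  # <|endoftext|>
print(tok.pad_token)  # <|padding|>
print(tok.unk_token)  # None -- no unk token is configured
                      # (see the tokenizer_config.json diff below)
```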
frequencymulti_256000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed37c183a6fdf658a259f23b64ca57aab65a1b96dbcb8343b06e108fe6fbec55
+size 20274248
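
Note that the pointer added here (oid ed37c183…, size 20274248) is exactly the object that frequency_256000/tokenizer.json referenced before this commit, i.e. the old frequency tokenizer now lives under frequencymulti_256000. A minimal sketch for reading such pointers, assuming the three-line version/oid/size layout shown above:

```python
# Minimal sketch: parse a git-lfs pointer file into a dict, assuming the
# three-line "version / oid / size" layout shown in this diff.
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Inspect the added pointer to confirm which object it names.
p = read_lfs_pointer("frequencymulti_256000/tokenizer.json")
print(p["oid"], p["size"])
```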
{fw57M_Entropy_thresholdBX_32000 → frequencymulti_256000}/tokenizer_config.json RENAMED
@@ -16,22 +16,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
-  "bos_token": "<|endoftext|>",
+  "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|padding|>",
   "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
+  "unk_token": null
 }
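
The renamed config drops the <|unk|> added token (id 259) and sets both bos_token and unk_token to explicit nulls, matching the new special_tokens_map.json. A quick sanity check against a local checkout (path illustrative):

```python
import json

# Illustrative checkout path; adjust to where the repo is cloned.
with open("frequencymulti_256000/tokenizer_config.json") as f:
    cfg = json.load(f)

# The <|unk|> added token (id 259) is gone, and bos/unk are explicit nulls.
assert "259" not in cfg["added_tokens_decoder"]
assert cfg["bos_token"] is None
assert cfg["unk_token"] is None
print("config matches the diff")
```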
fw57M_Entropy_thresholdBX_16000/stats.csv → frequencymulti_256000/vocab.json RENAMED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_16000/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_16000/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_16000/tokenizer_config.json DELETED
@@ -1,37 +0,0 @@
-{
-  "add_prefix_space": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<|padding|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|padding|>",
-  "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_16000/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_32000/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_32000/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_32000/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_8064/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_8064/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
fw57M_Entropy_thresholdBX_8064/tokenizer_config.json DELETED
@@ -1,37 +0,0 @@
-{
-  "add_prefix_space": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<|padding|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|padding|>",
-  "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_8064/vocab.json DELETED
The diff for this file is too large to render. See raw diff