Groovy-123 committed on
Commit
c37bfa2
·
verified ·
1 Parent(s): 135f8bb

Create tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +19 -0
tokenizer.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Train a byte-level BPE tokenizer on a local text file and serialize it to
# "tokenizer.json".
#
# NOTE(review): this is a Python script, but it was committed under the file
# name "tokenizer.json". Store it as a .py file: running it writes its output
# to "tokenizer.json" in the working directory, which would overwrite a script
# saved under that name.
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Create a BPE tokenizer. The ByteLevel pre-tokenizer and decoder are paired
# so that encoded text can be decoded back through the same byte mapping.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Train on your text data.
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    # "<unk>" is reserved in the vocabulary, but the BPE model above is not
    # configured with an unk_token, so the model itself never emits it.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    # Seed the vocabulary with every byte-level symbol. Without this, a byte
    # that never occurs in the training file is missing from the vocabulary
    # and cannot be encoded afterwards.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Replace 'train.txt' with your text file containing all training data.
tokenizer.train(files=["train.txt"], trainer=trainer)

# Save the tokenizer.json
tokenizer.save("tokenizer.json")
print("tokenizer.json is ready!")