def extract_all_chars(batch):
    # Join all normalized transcriptions into one string and collect its unique characters.
    all_text = " ".join(batch["normalized_text"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,  # process the whole dataset as a single batch
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}
Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer.
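To check whether the tokenizer can represent every character that appears in the dataset, one option is to take the set difference between the two vocabularies. The sketch below assumes the `dataset_vocab` and `tokenizer_vocab` sets computed above; any characters it prints would need to be mapped to supported characters or otherwise handled before training.

```python
# Characters that occur in the dataset but are missing from the tokenizer's vocabulary.
unsupported_chars = dataset_vocab - tokenizer_vocab
print(sorted(unsupported_chars))
```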