def extract_all_chars(batch):
    """Collect the distinct characters found in a batch of normalized text.

    Args:
        batch: Mapping whose ``"normalized_text"`` entry is an iterable of
            strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        unique characters (in no particular order, since it comes from a
        set) and ``"all_text"`` holds the texts joined with single spaces.
    """
    # The joining space becomes part of the vocabulary whenever the batch
    # contains more than one text.
    joined = " ".join(batch["normalized_text"])
    unique_chars = [*set(joined)]
    return dict(vocab=[unique_chars], all_text=[joined])
# Run the character extraction over the dataset, dropping the original
# columns so only "vocab" and "all_text" remain.
# NOTE(review): assuming `dataset` is a Hugging Face `datasets.Dataset`,
# batch_size=-1 hands the whole dataset to the function as one batch, so
# the result has a single row — confirm against the `Dataset.map` docs.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

# Characters seen in the dataset (first row of the "vocab" column).
dataset_vocab = set(vocabs["vocab"][0])

# Tokens known to the tokenizer; iterating the vocab mapping yields its keys.
tokenizer_vocab = set(tokenizer.get_vocab())

Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer.