from datasets import load_dataset
dataset = load_dataset("glue", "cola")
dataset = dataset["train"]  # Just take the training split for now

Next, load a tokenizer and tokenize the data as NumPy arrays.