# Language codes used as keys into each translation pair.
source_lang = "en"
target_lang = "fr"
# Text prepended to every source sentence before tokenization.
prefix = "translate English to French: "
def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Each entry of ``examples["translation"]`` is indexed by language code:
    the ``source_lang`` text is prepended with ``prefix`` and used as the
    input, and the ``target_lang`` text is passed to the tokenizer as
    ``text_target``.

    Relies on the module-level names ``source_lang``, ``target_lang``,
    ``prefix`` and ``tokenizer`` (the tokenizer is defined outside this
    snippet).

    Args:
        examples: A batch exposing a ``"translation"`` entry that iterates
            over per-example mappings keyed by language code.

    Returns:
        The object returned by ``tokenizer`` for the whole batch.
    """
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    # max_length=128 with truncation=True caps sequence length
    # (presumably for both inputs and targets; confirm in the tokenizer docs).
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs
To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] method.