# (source, target) pairs used by cleanup_text: each accented character on the
# left is replaced by the plain ASCII letter on the right.
replacements = [
    ("à", "a"),
    ("ç", "c"),
    ("è", "e"),
    ("ë", "e"),
    ("í", "i"),
    ("ï", "i"),
    ("ö", "o"),
    ("ü", "u"),
]
def cleanup_text(inputs):
    """Apply every pair in ``replacements`` to ``inputs["normalized_text"]``.

    Args:
        inputs: Mapping-like example with a ``"normalized_text"`` entry that
            supports ``.replace(src, dst)`` (i.e. a string).

    Returns:
        The same ``inputs`` object, with ``"normalized_text"`` overwritten by
        the cleaned text, so the function can be passed to a ``map`` call.
    """
    text = inputs["normalized_text"]
    # Replacements are applied sequentially, in table order.
    for src, dst in replacements:
        text = text.replace(src, dst)
    inputs["normalized_text"] = text
    return inputs
# Run cleanup_text through dataset.map and keep the returned dataset.
# NOTE(review): presumably `dataset` is a Hugging Face `datasets` object whose
# map() calls the function once per example — confirm against where it is loaded.
dataset = dataset.map(cleanup_text)
Now that you have dealt with special characters in the text, it's time to shift focus to the audio data.