```python
from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel
from datasets import load_dataset
import torch

# load a fine-tuned speech translation model and corresponding processor
model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15")

# let's perform inference on a piece of English speech (which we'll translate to German)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values

# autoregressively generate transcription (uses greedy decoding by default)
generated_ids = model.generate(input_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
# Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.
```
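Since `generate` uses greedy decoding by default, beam search can be requested through the standard generation arguments instead. A minimal sketch, reusing `model`, `processor`, and `input_values` from above (the beam size of 5 is an illustrative value, not a recommendation from this example):

```python
# sketch: beam search decoding instead of the default greedy decoding
# (num_beams=5 is an arbitrary illustrative choice, not tuned for this checkpoint)
generated_ids = model.generate(input_values, num_beams=5)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
```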