from datasets import load_dataset, Audio English stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="test", streaming=True) stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) en_sample = next(iter(stream_data))["audio"]["array"] Arabic stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "ar", split="test", streaming=True) stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) ar_sample = next(iter(stream_data))["audio"]["array"] Next, we load the model and processor from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor import torch model_id = "facebook/mms-lid-126" processor = AutoFeatureExtractor.from_pretrained(model_id) model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id) Now we process the audio data, pass the processed audio data to the model to classify it into a language, just like we usually do for Wav2Vec2 audio classification models such as ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition English inputs = processor(en_sample, sampling_rate=16_000, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs).logits lang_id = torch.argmax(outputs, dim=-1)[0].item() detected_lang = model.config.id2label[lang_id] 'eng' Arabic inputs = processor(ar_sample, sampling_rate=16_000, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs).logits lang_id = torch.argmax(outputs, dim=-1)[0].item() detected_lang = model.config.id2label[lang_id] 'ara' To see all the supported languages of a checkpoint, you can print out the language ids as follows: py processor.id2label.values() Audio Pretrained Models Pretrained models are available for two different sizes - 300M , 1Bil.