# Running MMS-LID inference in Colab

## 1. Clone fairseq and install the latest version

In [1]:
import os

!git clone https://github.com/pytorch/fairseq

# Change current working directory
!pwd
%cd "/content/fairseq"
!pip install --editable ./ 
!pip install tensorboardX


fatal: destination path 'fairseq' already exists and is not an empty directory.
/content
/content/fairseq
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: fairseq
  Building editable for fairseq (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fairseq: filename=fairseq-0.12.2-0.editable-cp310-cp310-linux_x86_64.whl size=9219 sha256=b6289e3715902d34fd7c54490679210a5be155dd4416754f0e8c376f193b5ac4
  Stored in directory: /tmp/pip-ephem-wheel-cache-o62sj_ry/wheels/c6/d7/db/bc419b1daa8266aa8de2a7c4d29f62dbfa814e8701fe4695a2
Successfully built fairseq
Installing collected packages: fairseq
  Attempting uninsta

## 2. Download MMS-LID model



In [5]:
available_models = ["l126", "l256", "l512", "l1024", "l2048", "l4017"]

# We will use L126 model which can recognize 126 languages 
model_name = available_models[0] # l126
print(f"Using model - {model_name}")
print(f"Visit https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}_langs.html to check all the languages supported by this model.")

! mkdir -p /content/models_lid
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}.pt'
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt'



Using model - l126
Visit https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126_langs.html to check all the languages supported by this model.
--2023-05-25 18:18:45--  https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 52.84.251.15, 52.84.251.114, 52.84.251.27, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|52.84.251.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3856229421 (3.6G) [binary/octet-stream]
Saving to: ‘/content/models_lid/l126/mms1b_l126.pt’


2023-05-25 18:19:09 (155 MB/s) - ‘/content/models_lid/l126/mms1b_l126.pt’ saved [3856229421/3856229421]

--2023-05-25 18:19:09--  https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 52.84.251.15, 52.84.251.114, 52.84.251.27, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|52.84.251.15|:443... connected.
HTTP request sent, awaiting response..

## 3. Prepare manifest files
Create the folder '/content/audio_samples/' and upload the .wav audio files whose language you want to identify, e.g. '/content/audio_samples/abc.wav', '/content/audio_samples/def.wav', etc.

Note: Make sure that the audio you use has a sample rate of 16 kHz. You can do this easily with FFmpeg, as in the example below, which converts an .mp3 file to .wav and sets the sample rate to 16 kHz.

Here, we use three examples: one audio file each in English, Hindi, and Chinese.

In [6]:
! mkdir -p /content/audio_samples/
for key in ["en_us", "hi_in", "cmn_hans_cn"]:
  !wget -O /content/audio_samples/tmp.mp3 https://datasets-server.huggingface.co/assets/google/fleurs/--/{key}/train/0/audio/audio.mp3
  !ffmpeg -hide_banner -loglevel error -y -i   /content/audio_samples/tmp.mp3 -ar 16000 /content/audio_samples/{key}.wav

! mkdir -p /content/audio_samples/


--2023-05-25 18:19:09--  https://datasets-server.huggingface.co/assets/google/fleurs/--/en_us/train/0/audio/audio.mp3
Resolving datasets-server.huggingface.co (datasets-server.huggingface.co)... 34.200.186.24, 44.197.252.161, 54.165.66.147, ...
Connecting to datasets-server.huggingface.co (datasets-server.huggingface.co)|34.200.186.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20853 (20K) [audio/mpeg]
Saving to: ‘/content/audio_samples/tmp.mp3’


2023-05-25 18:19:11 (92.8 KB/s) - ‘/content/audio_samples/tmp.mp3’ saved [20853/20853]

--2023-05-25 18:19:12--  https://datasets-server.huggingface.co/assets/google/fleurs/--/hi_in/train/0/audio/audio.mp3
Resolving datasets-server.huggingface.co (datasets-server.huggingface.co)... 34.200.186.24, 44.197.252.161, 54.165.66.147, ...
Connecting to datasets-server.huggingface.co (datasets-server.huggingface.co)|34.200.186.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26361 (26K) [audio/

In [7]:
! mkdir -p /content/manifest/
import os
with open("/content/manifest/dev.tsv", "w") as ftsv, open("/content/manifest/dev.lang", "w") as flang:
  ftsv.write("/\n")

  for fl in os.listdir("/content/audio_samples/"):
    if not fl.endswith(".wav"):
      continue
    audio_path = f"/content/audio_samples/{fl}"
    # duration should be number of samples in audio. For inference, using a random value should be fine. 
    duration = 1234 
    ftsv.write(f"{audio_path}\t{duration}\n")
    flang.write("eng\n") # This is the "true" language for the audio. For inference, using a random value should be fine. 


## 4. Run inference and identify the language of your audio(s)


In [8]:
import os

os.environ["PYTHONPATH"] = "/content/fairseq"
os.environ["PREFIX"] = "INFER"
os.environ["HYDRA_FULL_ERROR"] = "1"
os.environ["USER"] = "mms_lid_user"

!python3 examples/mms/lid/infer.py /content/models_lid/{model_name} --path /content/models_lid/{model_name}/mms1b_l126.pt \
  --task audio_classification  --infer-manifest /content/manifest/dev.tsv --output-path /content/manifest/

2023-05-25 18:19:19.545731: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
| loading model from /content/models_lid/l126/mms1b_l126.pt
2023-05-25 18:19:29 | INFO | fairseq.tasks.audio_classification | Using dict_path : /content/models_lid/l126/dict.lang.txt
2023-05-25 18:19:29 | INFO | root | === Number of labels = 126
2023-05-25 18:20:01 | INFO | fairseq.data.audio.raw_audio_dataset | loaded 3, skipped 0 samples
2023-05-25 18:20:01 | INFO | fairseq.tasks.fairseq_task | can_reuse_epoch_itr = True
2023-05-25 18:20:01 | INFO | fairseq.tasks.fairseq_task | reuse_dataloader = True
2023-05-25 18:20:01 | INFO | fairseq.tasks.fairseq_task | rebuild_batches = True
2023-05-25 18:20:01 | INFO | fairseq.tasks.fairseq_task | batches will be rebuilt for each

In [9]:
print("----- INPUT FILES -----")
! tail -n +2 /content/manifest/dev.tsv

print("\n----- TOP-K PREDICTONS WITH SCORE -----")
! cat /content/manifest//predictions.txt

----- INPUT FILES -----
/content/audio_samples/hi_in.wav	1234
/content/audio_samples/en_us.wav	1234
/content/audio_samples/cmn_hans_cn.wav	1234

----- TOP-K PREDICTONS WITH SCORE -----
[["hin", 0.9931250810623169], ["urd", 0.005808886140584946], ["snd", 0.0005312535213306546]]
[["eng", 0.9989539980888367], ["fas", 0.00036296260077506304], ["haw", 7.031611312413588e-05]]
[["cmn", 0.9996059536933899], ["bod", 0.0002111078501911834], ["kor", 9.211552242049947e-05]]
