{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "okQdUOf2ovBS" }, "source": [ "# Running MMS-TTS inference in Colab\n", "In this notebook, we give an example of how to run text-to-speech inference using MMS-TTS models.\n", "\n", "By default, we run inference on a GPU. If you want to perform CPU inference, go to the \"Runtime\" menu -> \"Change runtime type\" and set \"Hardware accelerator\" to \"None\" before running." ] }, { "cell_type": "markdown", "metadata": { "id": "XK2jXLmEpgK5" }, "source": [ "## 1. Preliminaries\n", "This section installs the necessary Python packages for the other sections. Run it first." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "vGyb3dGWpmks", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "9825fea8-d247-48d9-b33b-dbff36e905fa" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'vits'...\n", "remote: Enumerating objects: 81, done.\u001b[K\n", "remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81\u001b[K\n", "Unpacking objects: 100% (81/81), 3.33 MiB | 2.44 MiB/s, done.\n", "Python 3.10.11\n", "/content/vits\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting Cython==0.29.21\n", " Downloading Cython-0.29.21-py2.py3-none-any.whl (974 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m974.2/974.2 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: Cython\n", " Attempting uninstall: Cython\n", " Found existing installation: Cython 0.29.34\n", " Uninstalling Cython-0.29.34:\n", " Successfully uninstalled Cython-0.29.34\n", "Successfully installed Cython-0.29.21\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting librosa==0.8.0\n", " Downloading librosa-0.8.0.tar.gz (183 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m183.9/183.9 kB\u001b[0m \u001b[31m15.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (3.0.0)\n", "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (1.22.4)\n", "Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (1.10.1)\n", "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (1.2.2)\n", "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (1.2.0)\n", "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (4.4.2)\n", "Collecting resampy>=0.2.2 (from librosa==0.8.0)\n", " Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m101.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numba>=0.43.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (0.56.4)\n", "Requirement already satisfied: soundfile>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (0.12.1)\n", "Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.8.0) (1.6.0)\n", "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.43.0->librosa==0.8.0) (0.39.1)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from numba>=0.43.0->librosa==0.8.0) (67.7.2)\n", "Requirement already satisfied: appdirs>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.0->librosa==0.8.0) (1.4.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.0->librosa==0.8.0) (23.1)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.0->librosa==0.8.0) (2.27.1)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa==0.8.0) (3.1.0)\n", "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.9.0->librosa==0.8.0) (1.15.1)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.9.0->librosa==0.8.0) (2.21)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa==0.8.0) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa==0.8.0) (2022.12.7)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa==0.8.0) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa==0.8.0) (3.4)\n", "Building wheels for collected packages: librosa\n", " Building wheel for librosa (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for librosa: filename=librosa-0.8.0-py3-none-any.whl size=201378 sha256=c299b7ae3d6d527a4889716009ab27ca4018546d04f0e4de1019ea919311c0dc\n", " Stored in directory: /root/.cache/pip/wheels/bf/b7/85/2f8044306ccec014930aea23ad4852fca9e2584e21c6972bc6\n", "Successfully built librosa\n", "Installing collected packages: resampy, librosa\n", " Attempting uninstall: librosa\n", " Found existing installation: librosa 0.10.0.post2\n", " Uninstalling librosa-0.10.0.post2:\n", " Successfully uninstalled librosa-0.10.0.post2\n", "Successfully installed librosa-0.8.0 resampy-0.4.2\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting phonemizer==2.2.1\n", " Downloading phonemizer-2.2.1-py3-none-any.whl (49 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.0/49.0 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from phonemizer==2.2.1) (1.2.0)\n", "Collecting segments (from phonemizer==2.2.1)\n", " Downloading segments-2.2.1-py2.py3-none-any.whl (15 kB)\n", "Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.10/dist-packages (from phonemizer==2.2.1) (23.1.0)\n", "Collecting clldutils>=1.7.3 (from segments->phonemizer==2.2.1)\n", " Downloading clldutils-3.19.0-py2.py3-none-any.whl (1.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting csvw>=1.5.6 (from segments->phonemizer==2.2.1)\n", " Downloading csvw-3.1.3-py2.py3-none-any.whl (56 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.7/56.7 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from segments->phonemizer==2.2.1) (2022.10.31)\n", "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer==2.2.1) (2.8.2)\n", "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer==2.2.1) (0.8.10)\n", "Collecting colorlog (from clldutils>=1.7.3->segments->phonemizer==2.2.1)\n", " Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n", "Collecting pylatexenc (from clldutils>=1.7.3->segments->phonemizer==2.2.1)\n", " Downloading pylatexenc-2.10.tar.gz (162 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.6/162.6 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer==2.2.1) (3.4.3)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer==2.2.1) (4.9.2)\n", "Requirement already satisfied: markupsafe in /usr/local/lib/python3.10/dist-packages (from clldutils>=1.7.3->segments->phonemizer==2.2.1) (2.1.2)\n", "Requirement already satisfied: babel in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer==2.2.1) (2.12.1)\n", "Collecting colorama (from csvw>=1.5.6->segments->phonemizer==2.2.1)\n", " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", "Collecting isodate (from csvw>=1.5.6->segments->phonemizer==2.2.1)\n", " Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: jsonschema in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer==2.2.1) (4.3.3)\n", "Collecting language-tags (from csvw>=1.5.6->segments->phonemizer==2.2.1)\n", " Downloading language_tags-1.2.0-py3-none-any.whl (213 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.4/213.4 kB\u001b[0m \u001b[31m29.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting rdflib (from csvw>=1.5.6->segments->phonemizer==2.2.1)\n", " Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m528.1/528.1 kB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer==2.2.1) (2.27.1)\n", "Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer==2.2.1)\n", " Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)\n", "Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from csvw>=1.5.6->segments->phonemizer==2.2.1) (4.1.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from isodate->csvw>=1.5.6->segments->phonemizer==2.2.1) (1.16.0)\n", "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer==2.2.1) (0.19.3)\n", "Requirement already satisfied: pyparsing<4,>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer==2.2.1) (3.0.9)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer==2.2.1) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer==2.2.1) (2022.12.7)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer==2.2.1) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer==2.2.1) (3.4)\n", "Building wheels for collected packages: pylatexenc\n", " Building wheel for pylatexenc (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136820 sha256=e99eecd0f55e1827ac73565fc43f5565f432aca243434ea921e0a31c5827331d\n", " Stored in directory: /root/.cache/pip/wheels/d3/31/8b/e09b0386afd80cfc556c00408c9aeea5c35c4d484a9c762fd5\n", "Successfully built pylatexenc\n", "Installing collected packages: rfc3986, pylatexenc, language-tags, isodate, colorlog, colorama, rdflib, clldutils, csvw, segments, phonemizer\n", "Successfully installed clldutils-3.19.0 colorama-0.4.6 colorlog-6.7.0 csvw-3.1.3 isodate-0.6.1 language-tags-1.2.0 phonemizer-2.2.1 pylatexenc-2.10 rdflib-6.3.2 rfc3986-1.5.0 segments-2.2.1\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (1.10.1)\n", "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scipy) (1.22.4)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.22.4)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (0.15.2+cu118)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision) (1.22.4)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.27.1)\n", "Requirement already satisfied: torch==2.0.1 in /usr/local/lib/python3.10/dist-packages (from torchvision) (2.0.1+cu118)\n", "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision) (8.4.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (3.12.0)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (4.5.0)\n", "Requirement already satisfied: sympy in 
/usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (1.11.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (3.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (3.1.2)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch==2.0.1->torchvision) (2.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch==2.0.1->torchvision) (3.25.2)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch==2.0.1->torchvision) (16.0.5)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2022.12.7)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchvision) (3.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch==2.0.1->torchvision) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch==2.0.1->torchvision) (1.3.0)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.7.1)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.0.7)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.11.0)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.39.3)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.4)\n", "Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (23.1)\n", "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (8.4.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.0.9)\n", "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (2.8.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting Unidecode==1.1.1\n", " Downloading Unidecode-1.1.1-py2.py3-none-any.whl (238 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.3/238.3 kB\u001b[0m \u001b[31m18.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: Unidecode\n", "Successfully installed Unidecode-1.1.1\n", "/content/vits/monotonic_align\n", "Compiling core.pyx because 
it changed.\n", "[1/1] Cythonizing core.pyx\n", "/usr/local/lib/python3.10/dist-packages/Cython/Compiler/Main.py:369: FutureWarning: Cython directive 'language_level' not set, using 2 for now (Py2). This will change in a later release! File: /content/vits/monotonic_align/core.pyx\n", " tree = Parsing.p_module(s, pxd, full_module_name)\n", "\u001b[01m\u001b[Kcore.c:\u001b[m\u001b[K In function ‘\u001b[01m\u001b[K__Pyx_InitGlobals\u001b[m\u001b[K’:\n", "\u001b[01m\u001b[Kcore.c:16766:1:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[K‘\u001b[01m\u001b[KPyEval_InitThreads\u001b[m\u001b[K’ is deprecated [\u001b[01;35m\u001b[K-Wdeprecated-declarations\u001b[m\u001b[K]\n", "16766 | \u001b[01;35m\u001b[KPyEval_InitThreads\u001b[m\u001b[K();\n", " | \u001b[01;35m\u001b[K^~~~~~~~~~~~~~~~~~\u001b[m\u001b[K\n", "In file included from \u001b[01m\u001b[K/usr/include/python3.10/Python.h:130\u001b[m\u001b[K,\n", " from \u001b[01m\u001b[Kcore.c:16\u001b[m\u001b[K:\n", "\u001b[01m\u001b[K/usr/include/python3.10/ceval.h:122:37:\u001b[m\u001b[K \u001b[01;36m\u001b[Knote: \u001b[m\u001b[Kdeclared here\n", " 122 | Py_DEPRECATED(3.9) PyAPI_FUNC(void) \u001b[01;36m\u001b[KPyEval_InitThreads\u001b[m\u001b[K(void);\n", " | \u001b[01;36m\u001b[K^~~~~~~~~~~~~~~~~~\u001b[m\u001b[K\n", "/content/vits\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "'/content/vits'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 1 } ], "source": [ "%pwd\n", "!git clone https://github.com/jaywalnut310/vits.git\n", "!python --version\n", "%cd vits/\n", "\n", "!pip install Cython==0.29.21\n", "!pip install librosa==0.8.0\n", "!pip install phonemizer==2.2.1\n", "!pip install scipy\n", "!pip install numpy\n", "!pip install torch\n", "!pip install torchvision\n", "!pip install matplotlib\n", "!pip install Unidecode==1.1.1\n", "\n", "%cd monotonic_align/\n", "%mkdir monotonic_align\n", "!python3 setup.py build_ext --inplace\n", "%cd ../\n", "%pwd" ] }, { "cell_type": "markdown", "metadata": { "id": "KuBzieKbuJKN" }, "source": [ "## 2. Choose a language and download its checkpoint\n", "Find the ISO code for your target language [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html). You can find more details about the languages we currently support for TTS in this [table](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "UtEeQcmwuUaG", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2adfb7eb-b9a2-44c3-8571-72fbc4b60aff" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Download model for language: eng\n", "Model checkpoints in ./eng: ['G_100000.pth', 'config.json', 'vocab.txt']\n" ] } ], "source": [ "import os\n", "import subprocess\n", "import locale\n", "locale.getpreferredencoding = lambda: \"UTF-8\"\n", "\n", "def download(lang, tgt_dir=\"./\"):\n", " lang_fn, lang_dir = os.path.join(tgt_dir, lang+'.tar.gz'), os.path.join(tgt_dir, lang)\n", " cmd = \";\".join([\n", " f\"wget https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz -O {lang_fn}\",\n", " f\"tar zxvf {lang_fn}\"\n", " ])\n", " print(f\"Download model for language: {lang}\")\n", " subprocess.check_output(cmd, shell=True)\n", " print(f\"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}\")\n", " return lang_dir\n", "\n", "LANG = \"eng\"\n", "ckpt_dir = download(LANG)" ] }, { "cell_type": "markdown", "source": [ "## 3. 
Load the checkpoint" ], "metadata": { "id": "zexlezYiSWMb" } }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "Sxi3CXmGqH6r", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "24710ada-6f04-4f29-c5f2-000458784ed8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Run inference with cuda\n", "load ./eng/G_100000.pth\n" ] } ], "source": [ "from IPython.display import Audio\n", "import os\n", "import re\n", "import glob\n", "import json\n", "import tempfile\n", "import math\n", "import torch\n", "from torch import nn\n", "from torch.nn import functional as F\n", "from torch.utils.data import DataLoader\n", "import numpy as np\n", "import commons\n", "import utils\n", "import argparse\n", "import subprocess\n", "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n", "from models import SynthesizerTrn\n", "from scipy.io.wavfile import write\n", "\n", "def preprocess_char(text, lang=None):\n", "    \"\"\"\n", "    Special treatment of characters in certain languages\n", "    \"\"\"\n", "    print(lang)\n", "    if lang == 'ron':\n", "        text = text.replace(\"ț\", \"ţ\")\n", "    return text\n", "\n", "# maps raw text to the model's symbol vocabulary (and handles uromanization / OOV filtering)\n", "class TextMapper(object):\n", "    def __init__(self, vocab_file):\n", "        self.symbols = [x.replace(\"\\n\", \"\") for x in open(vocab_file, encoding=\"utf-8\").readlines()]\n", "        self.SPACE_ID = self.symbols.index(\" \")\n", "        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}\n", "        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}\n", "\n", "    def text_to_sequence(self, text, cleaner_names):\n", "        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.\n", "        Args:\n", "            text: string to convert to a sequence\n", "            cleaner_names: names of the cleaner functions to run the text through\n", "        Returns:\n", "            List of integers corresponding to the symbols in the text\n", "        '''\n", "        sequence = []\n", "        clean_text = text.strip()\n", "        for symbol in clean_text:\n", "            symbol_id = self._symbol_to_id[symbol]\n", "            sequence += [symbol_id]\n", "        return sequence\n", "\n", "    def uromanize(self, text, uroman_pl):\n", "        iso = \"xxx\"\n", "        with tempfile.NamedTemporaryFile() as tf, \\\n", "            tempfile.NamedTemporaryFile() as tf2:\n", "            with open(tf.name, \"w\") as f:\n", "                f.write(\"\\n\".join([text]))\n", "            cmd = f\"perl \" + uroman_pl\n", "            cmd += f\" -l {iso} \"\n", "            cmd += f\" < {tf.name} > {tf2.name}\"\n", "            os.system(cmd)\n", "            outtexts = []\n", "            with open(tf2.name) as f:\n", "                for line in f:\n", "                    line = re.sub(r\"\\s+\", \" \", line).strip()\n", "                    outtexts.append(line)\n", "            outtext = outtexts[0]\n", "        return outtext\n", "\n", "    def get_text(self, text, hps):\n", "        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)\n", "        if hps.data.add_blank:\n", "            text_norm = commons.intersperse(text_norm, 0)\n", "        text_norm = torch.LongTensor(text_norm)\n", "        return text_norm\n", "\n", "    def filter_oov(self, text):\n", "        val_chars = self._symbol_to_id\n", "        txt_filt = \"\".join(list(filter(lambda x: x in val_chars, text)))\n", "        print(f\"text after filtering OOV: {txt_filt}\")\n", "        return txt_filt\n", "\n", "def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):\n", "    txt = preprocess_char(txt, lang=lang)\n", "    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'\n", "    if is_uroman:\n", "        with tempfile.TemporaryDirectory() as tmp_dir:\n", "            if uroman_dir is None:\n", "                cmd = f\"git clone https://github.com/isi-nlp/uroman.git {tmp_dir}\"\n",
"                print(cmd)\n", "                subprocess.check_output(cmd, shell=True)\n", "                uroman_dir = tmp_dir\n", "            uroman_pl = os.path.join(uroman_dir, \"bin\", \"uroman.pl\")\n", "            print(f\"uromanize\")\n", "            txt = text_mapper.uromanize(txt, uroman_pl)\n", "            print(f\"uroman text: {txt}\")\n", "    txt = txt.lower()\n", "    txt = text_mapper.filter_oov(txt)\n", "    return txt\n", "\n", "if torch.cuda.is_available():\n", "    device = torch.device(\"cuda\")\n", "else:\n", "    device = torch.device(\"cpu\")\n", "\n", "print(f\"Run inference with {device}\")\n", "vocab_file = f\"{ckpt_dir}/vocab.txt\"\n", "config_file = f\"{ckpt_dir}/config.json\"\n", "assert os.path.isfile(config_file), f\"{config_file} doesn't exist\"\n", "hps = utils.get_hparams_from_file(config_file)\n", "text_mapper = TextMapper(vocab_file)\n", "net_g = SynthesizerTrn(\n", "    len(text_mapper.symbols),\n", "    hps.data.filter_length // 2 + 1,\n", "    hps.train.segment_size // hps.data.hop_length,\n", "    **hps.model)\n", "net_g.to(device)\n", "_ = net_g.eval()\n", "\n", "g_pth = f\"{ckpt_dir}/G_100000.pth\"\n", "print(f\"load {g_pth}\")\n", "\n", "_ = utils.load_checkpoint(g_pth, net_g, None)" ] }, { "cell_type": "markdown", "source": [ "## 4. Generate audio from text\n", "Specify the sentence you want to synthesize and generate the audio. You can also adjust `noise_scale`, `noise_scale_w`, and `length_scale` in the `net_g.infer` call below to change the variability and speaking rate of the generated speech." ], "metadata": { "id": "fIiwaWl6SiVy" } }, { "cell_type": "code", "source": [ "txt = \"Expanding the language coverage of speech technology has the potential to improve access to information for many more people\"\n", "\n", "print(f\"text: {txt}\")\n", "# preprocess the text and convert it to a sequence of symbol IDs\n", "txt = preprocess_text(txt, text_mapper, hps, lang=LANG)\n", "stn_tst = text_mapper.get_text(txt, hps)\n", "# run VITS inference to generate the waveform\n", "with torch.no_grad():\n", "    x_tst = stn_tst.unsqueeze(0).to(device)\n", "    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)\n", "    hyp = net_g.infer(\n", "        x_tst, x_tst_lengths, noise_scale=.667,\n", "        noise_scale_w=0.8, length_scale=1.0\n", "    )[0][0,0].cpu().float().numpy()\n", "\n", "print(f\"Generated audio\")\n", "Audio(hyp, rate=hps.data.sampling_rate)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 165 }, "id": "mpSvjfSCGBDm", "outputId": "142581f8-e9ec-4d17-d4da-413176e3cee3" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "text: Expanding the language coverage of speech technology has the potential to improve access to information for many more people\n", "eng\n", "text after filtering OOV: expanding the language coverage of speech technology has the potential to improve access to information for many more people\n", "Generated audio\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " " ] }, "metadata": {}, "execution_count": 4 } ] } ], "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }