ssl-aasist / fairseq /examples /mms /tts /infer.py

Add files using upload-large-folder tool

9742bb8 verified 5 months ago

5.8 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import os
	import re
	import glob
	import json
	import tempfile
	import math
	import torch
	from torch import nn
	from torch.nn import functional as F
	from torch.utils.data import DataLoader
	import numpy as np
	import commons
	import utils
	import argparse
	import subprocess
	from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
	from models import SynthesizerTrn
	from scipy.io.wavfile import write

	class TextMapper(object):
	    """Map text to the integer symbol IDs defined by a vocabulary file.

	    The vocabulary file holds one symbol per line; a symbol's ID is its
	    zero-based line index. The vocabulary must contain the space symbol.
	    """

	    def __init__(self, vocab_file):
	        """Load the symbol table from ``vocab_file`` (read as UTF-8).

	        Args:
	            vocab_file: path to a text file with one symbol per line.

	        Raises:
	            ValueError: if the vocabulary has no ``" "`` entry.
	        """
	        # A context manager closes the handle deterministically instead of
	        # leaving it to the garbage collector.
	        with open(vocab_file, encoding="utf-8") as f:
	            self.symbols = [line.replace("\n", "") for line in f]
	        self.SPACE_ID = self.symbols.index(" ")
	        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
	        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

	    def text_to_sequence(self, text, cleaner_names):
	        """Convert a string to the list of IDs of its characters.

	        Args:
	            text: string to convert; leading/trailing whitespace is stripped.
	            cleaner_names: accepted for interface compatibility; not used by
	                this implementation.

	        Returns:
	            List of integers, one per character of the stripped text.

	        Raises:
	            KeyError: if a character is not in the vocabulary (run
	                ``filter_oov`` first to drop such characters).
	        """
	        return [self._symbol_to_id[symbol] for symbol in text.strip()]

	    def uromanize(self, text, uroman_pl):
	        """Romanize ``text`` by running the uroman Perl script.

	        The text is piped to ``perl <uroman_pl> -l xxx`` on stdin as UTF-8 and
	        the script's stdout is decoded as UTF-8, independent of the locale.

	        Args:
	            text: string to romanize.
	            uroman_pl: path to the ``uroman.pl`` script.

	        Returns:
	            The first line of the script's output with every whitespace run
	            collapsed to a single space and the ends stripped ("" if the
	            script printed nothing).

	        Raises:
	            subprocess.CalledProcessError: if perl exits with a non-zero status.
	            FileNotFoundError: if the ``perl`` executable cannot be found.
	        """
	        iso = "xxx"
	        # An argument list (no shell) keeps paths containing spaces or shell
	        # metacharacters intact; check=True surfaces a failing script here
	        # rather than as a confusing error further down.
	        proc = subprocess.run(
	            ["perl", uroman_pl, "-l", iso],
	            input=text,
	            stdout=subprocess.PIPE,
	            encoding="utf-8",
	            check=True,
	        )
	        first_line = proc.stdout.split("\n", 1)[0]
	        return re.sub(r"\s+", " ", first_line).strip()

	    def get_text(self, text, hps):
	        """Convert ``text`` to a ``torch.LongTensor`` of symbol IDs.

	        Args:
	            text: string whose characters are all in the vocabulary.
	            hps: hyper-parameter object; ``hps.data.text_cleaners`` and
	                ``hps.data.add_blank`` are read.

	        Returns:
	            1-D ``torch.LongTensor`` of symbol IDs.
	        """
	        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
	        if hps.data.add_blank:
	            # presumably inserts ID 0 between symbols -- confirm against
	            # commons.intersperse
	            text_norm = commons.intersperse(text_norm, 0)
	        return torch.LongTensor(text_norm)

	    def filter_oov(self, text, lang=None):
	        """Drop every character of ``text`` that is not in the vocabulary.

	        Language-specific character normalization (``preprocess_char``) is
	        applied first. The filtered text is also printed.

	        Args:
	            text: input string.
	            lang: optional language ISO code forwarded to ``preprocess_char``.

	        Returns:
	            The text restricted to in-vocabulary characters.
	        """
	        text = self.preprocess_char(text, lang=lang)
	        txt_filt = "".join(ch for ch in text if ch in self._symbol_to_id)
	        print(f"text after filtering OOV: {txt_filt}")
	        return txt_filt

	    def preprocess_char(self, text, lang=None):
	        """Apply special treatment of characters for certain languages.

	        For ``lang == "ron"`` every "ț" is replaced by "ţ" and the result is
	        printed; for any other ``lang`` the text is returned unchanged.
	        """
	        if lang == "ron":
	            text = text.replace("ț", "ţ")
	            print(f"{lang} (ț -> ţ): {text}")
	        return text

	def generate():
	    """Command-line entry point: synthesize ``--txt`` and write it to ``--wav``.

	    Reads ``vocab.txt``, ``config.json`` and ``G_100000.pth`` from
	    ``--model-dir``, builds a ``SynthesizerTrn``, optionally romanizes the
	    input with uroman (when the configured training-files name ends in
	    ``.uroman``), lower-cases it, drops out-of-vocabulary characters, runs
	    inference and writes the waveform at ``hps.data.sampling_rate``.

	    Raises:
	        FileNotFoundError: if ``config.json`` is missing from the model dir.
	        subprocess.CalledProcessError: if cloning uroman fails.
	    """
	    parser = argparse.ArgumentParser(description='TTS inference')
	    # These three are needed unconditionally; mark them required so a missing
	    # flag is reported by argparse instead of failing later on a None value.
	    parser.add_argument('--model-dir', type=str, required=True, help='model checkpoint dir')
	    parser.add_argument('--wav', type=str, required=True, help='output wav path')
	    parser.add_argument('--txt', type=str, required=True, help='input text')
	    parser.add_argument('--uroman-dir', type=str, default=None, help='uroman lib dir (will download if not specified)')
	    parser.add_argument('--lang', type=str, default=None, help='language iso code (required for Romanian)')
	    args = parser.parse_args()
	    ckpt_dir, wav_path, txt = args.model_dir, args.wav, args.txt

	    # Prefer CUDA, then Apple MPS (when this torch build has it), else CPU.
	    if torch.cuda.is_available():
	        device = torch.device("cuda")
	    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
	        device = torch.device("mps")
	    else:
	        device = torch.device("cpu")

	    print(f"Run inference with {device}")
	    vocab_file = f"{ckpt_dir}/vocab.txt"
	    config_file = f"{ckpt_dir}/config.json"
	    # Explicit check rather than `assert`, which is stripped under `python -O`.
	    if not os.path.isfile(config_file):
	        raise FileNotFoundError(f"{config_file} doesn't exist")
	    hps = utils.get_hparams_from_file(config_file)
	    text_mapper = TextMapper(vocab_file)
	    net_g = SynthesizerTrn(
	        len(text_mapper.symbols),
	        hps.data.filter_length // 2 + 1,
	        hps.train.segment_size // hps.data.hop_length,
	        **hps.model)
	    net_g.to(device)
	    _ = net_g.eval()

	    g_pth = f"{ckpt_dir}/G_100000.pth"
	    print(f"load {g_pth}")

	    _ = utils.load_checkpoint(g_pth, net_g, None)

	    print(f"text: {txt}")
	    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
	    if is_uroman:
	        # The temporary directory only receives a clone when no uroman
	        # directory was supplied; it is removed when the block exits.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            uroman_dir = args.uroman_dir
	            if uroman_dir is None:
	                # HTTPS clones a public repository anonymously; the SSH form
	                # (git@github.com:...) requires a configured GitHub key.
	                clone_cmd = ["git", "clone", "https://github.com/isi-nlp/uroman.git", tmp_dir]
	                print(" ".join(clone_cmd))
	                subprocess.check_output(clone_cmd)
	                uroman_dir = tmp_dir
	            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
	            print(f"uromanize")
	            txt = text_mapper.uromanize(txt, uroman_pl)
	            print(f"uroman text: {txt}")
	    txt = txt.lower()
	    txt = text_mapper.filter_oov(txt, lang=args.lang)
	    stn_tst = text_mapper.get_text(txt, hps)
	    with torch.no_grad():
	        x_tst = stn_tst.unsqueeze(0).to(device)
	        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
	        # Element [0, 0] of infer's first output, moved to CPU as a float32
	        # numpy array for the wav writer.
	        hyp = net_g.infer(
	            x_tst, x_tst_lengths, noise_scale=.667,
	            noise_scale_w=0.8, length_scale=1.0
	        )[0][0, 0].cpu().float().numpy()

	    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
	    # so only create the directory when the path actually has one.
	    out_dir = os.path.dirname(wav_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    print(f"wav: {wav_path}")
	    write(wav_path, hps.data.sampling_rate, hyp)


	# Run inference only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    generate()