# TTS_Gradio2 / app.py
import sys
import io, os, stat
import subprocess
import random
from zipfile import ZipFile
import uuid
import time
import torch
import torchaudio
# Download the UniDic dictionary required by MeCab (Japanese tokenization)
os.system('python -m unidic download')
# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"
# langid is used to detect the language of the input text;
# most users expect the text to be treated as their own language
import langid
import base64
import csv
from io import StringIO
import datetime
import re
import gradio as gr
from scipy.io.wavfile import write
from pydub import AudioSegment
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager
HF_TOKEN = os.environ.get("HF_TOKEN")
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline  # transformers components for sentiment analysis
# The API client can be used to restart the Space on an unrecoverable error
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"
# Use a newer ffmpeg binary on Ubuntu 20.04 so the denoise filter works for microphone input
print("Export newer ffmpeg binary for denoise filter")
if not os.path.exists("ffmpeg"):
    ZipFile("ffmpeg.zip").extractall()
    print("Make ffmpeg binary executable")
    st = os.stat("ffmpeg")
    os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
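# Illustrative sketch (not called by this app): how the bundled ./ffmpeg binary
# could be invoked to denoise a microphone recording with ffmpeg's `afftdn`
# (FFT denoise) filter. The helper name and filter settings are assumptions.
def denoise_with_ffmpeg(input_path, output_path):
    """Run the local ffmpeg binary with an FFT-based denoise filter (sketch)."""
    subprocess.run(
        ["./ffmpeg", "-y", "-i", input_path, "-af", "afftdn", output_path],
        check=True,
    )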
# Load XTTS model
print("Loading TTS model...")
model = None # Initialize model to None
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS downloaded to:", model_path)
    config = XttsConfig()
    config.load_json(os.path.join(model_path, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(
        config,
        checkpoint_path=os.path.join(model_path, "model.pth"),
        vocab_path=os.path.join(model_path, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        model.cuda()
        print("XTTS v2 model loaded successfully to CUDA.")
    else:
        print("XTTS v2 model loaded successfully to CPU.")
except Exception as e:
    print(f"Error loading XTTS v2 model: {e}")
# Load sentiment models
print("Loading sentiment models...")
sentiment_tokenizer = None
sentiment_model = None
sentiment_analyzer = None
try:
    arabic_model_name = "aubmindlab/bert-base-arabertv02-twitter"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("UBC-NLP/MARBERT")
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    print("Sentiment models loaded.")
except Exception as e:
    print(f"Error loading sentiment models: {e}")
# Language detection
def detect_language_safely(text):
    try:
        # Treat any Arabic-script character as Arabic
        if any('\u0600' <= c <= '\u06FF' for c in text):
            return "ar"
        # langid.classify returns a (language, score) tuple
        return langid.classify(text)[0]
    except Exception:
        return "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
# Sentiment to emotion mapping
def map_sentiment_to_emotion(sentiment, language="en"):
    if language == "ar":
        return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
    return "happy" if "positive" in sentiment.lower() else "sad" if "negative" in sentiment.lower() else "neutral"
# Simple Arabic sentiment analysis: keyword matching first, model fallback second
def arabic_sentiment_analysis(text):
    if sentiment_tokenizer is None or sentiment_model is None:
        return "neutral"
    pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
    neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
    pos_count = sum(1 for word in pos_words if word in text.lower())
    neg_count = sum(1 for word in neg_words if word in text.lower())
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        try:
            inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            outputs = sentiment_model(**inputs)
            sentiment_class = torch.argmax(outputs.logits).item()
            return ["negative", "neutral", "positive"][sentiment_class]
        except Exception as e:
            print(f"Error during Arabic sentiment analysis: {e}")
            return "neutral"
import tempfile
import numpy as np
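# Hypothetical helper (not called anywhere below): gr.Audio in numpy mode can
# return a stereo (n_samples, 2) array, while a mono reference is the safer
# input for speaker encoding. A minimal downmix sketch, assuming 2-D = stereo:
def to_mono(audio_data):
    """Average stereo channels into mono; pass 1-D audio through unchanged."""
    if audio_data.ndim == 2:
        return np.mean(audio_data, axis=1).astype(audio_data.dtype)
    return audio_data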
def tts_interface(text_input, speaker_audio):
    print("--- tts_interface function called ---")
    print(f"Text Input: {text_input}")
    print(f"Speaker Audio: {speaker_audio}")  # Log the (sampling_rate, data) tuple
    if model is None:
        print("Error: TTS model failed to load.")
        return "Error: TTS model failed to load. Check the logs."
    if speaker_audio is None:
        print("Error: Please upload a reference audio.")
        return "Error: Please upload a reference audio."
    language = detect_language_safely(text_input)
    emotion = "neutral"
    audio_output_path = "output.wav"
    print(f"Detected Language: {language}")
    if sentiment_analyzer is not None:
        try:
            print("Performing sentiment analysis...")
            if language == "en":
                sentiment_result = sentiment_analyzer(text_input)[0]
                emotion = map_sentiment_to_emotion(sentiment_result["label"])
                print(f"English Sentiment: {sentiment_result}, Emotion: {emotion}")
            else:
                sentiment_result = arabic_sentiment_analysis(text_input)
                emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
                print(f"Arabic Sentiment: {sentiment_result}, Emotion: {emotion}")
        except Exception as e:
            print(f"Error during sentiment analysis: {e}")
    else:
        print("Sentiment analyzer not loaded.")
    temp_audio_file = None
    try:
        print("Attempting to generate audio using model.inference...")
        # Save the uploaded audio to a temporary file
        sampling_rate, audio_data = speaker_audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_audio_path = tmp_file.name
        write(temp_audio_path, sampling_rate, audio_data)
        print(f"Temporary audio file saved to: {temp_audio_path}")
        temp_audio_file = temp_audio_path
        # Extract speaker embedding using the temporary file path
        try:
            (
                gpt_cond_latent,
                speaker_embedding,
            ) = model.get_conditioning_latents(
                audio_path=temp_audio_file,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
            print("Speaker embedding extracted successfully.")
        except Exception as e:
            print("Speaker encoding error:", str(e))
            return f"Error during speaker encoding: {e}"
        # Perform inference
        out = model.inference(
            text=text_input,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            # emotion=emotion  # Emotion handling might need further investigation
        )
        # Save the generated audio at XTTS's 24 kHz output rate
        torchaudio.save(audio_output_path, torch.tensor(out["wav"]).unsqueeze(0).cpu(), 24000)
        print(f"Audio generated and saved to: {audio_output_path}")
        return audio_output_path
    except Exception as e:
        print(f"Error during TTS inference: {e}")
        return f"Error during TTS inference: {e}"
    finally:
        # Clean up the temporary audio file
        if temp_audio_file and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)
            print(f"Temporary audio file removed: {temp_audio_file}")
iface = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(sources=["upload"], label="Upload Reference Audio"),
    ],
    outputs=gr.Audio(label="Generated Audio", autoplay=True),
    title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
    description="Enter text and upload a reference audio to clone the voice. The XTTS v2 model will generate speech with an emotion inferred from the sentiment of the text (English and Arabic supported).",
)
if __name__ == "__main__":
    iface.launch()