Moustafa1111111111 committed on
Commit cdd1279 · 1 Parent(s): 38d1b63

Add TTS gradio

Files changed (2)
  1. app.py +123 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,123 @@
+ import gradio as gr
+ import torch
+ from TTS.api import TTS
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ from langdetect import detect
+
+ # Allowlist XttsConfig so torch.load doesn't raise UnpicklingError
+ from torch.serialization import add_safe_globals
+ from TTS.tts.configs.xtts_config import XttsConfig
+ add_safe_globals([XttsConfig])
+
+ # ✅ Monkey-patch torch.load to always use weights_only=False
+ _original_torch_load = torch.load
+ def patched_torch_load(*args, **kwargs):
+     kwargs["weights_only"] = False
+     return _original_torch_load(*args, **kwargs)
+ torch.load = patched_torch_load
+
+ print("Loading TTS model from Hugging Face Hub...")
+ try:
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda" if torch.cuda.is_available() else "cpu")
+     print("XTTS v2 model loaded successfully from Hugging Face Hub.")
+ except Exception as e:
+     print(f"Error loading XTTS v2 model from Hugging Face Hub: {e}")
+     tts = None  # Set tts to None if loading fails
+
+ print("Loading sentiment models...")
+ try:
+     arabic_model_name = "UBC-NLP/MARBERT"  # Tokenizer and model must come from the same checkpoint so the vocabularies match
+     sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
+     sentiment_model = AutoModelForSequenceClassification.from_pretrained(arabic_model_name)
+     sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+     print("Sentiment models loaded.")
+ except Exception as e:
+     print(f"Error loading sentiment models: {e}")
+     sentiment_tokenizer = sentiment_model = sentiment_analyzer = None  # Keep all names defined so later guards don't raise NameError
+
+ # Language detection
+ def detect_language_safely(text):
+     try:
+         if any('\u0600' <= c <= '\u06FF' for c in text):
+             return "ar"
+         return detect(text)
+     except:
+         return "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
+
+ # Sentiment to emotion mapping
+ def map_sentiment_to_emotion(sentiment, language="en"):
+     if language == "ar":
+         return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
+     return "happy" if "positive" in sentiment.lower() else "sad" if "negative" in sentiment.lower() else "neutral"
+
+ # Simple Arabic sentiment analysis
+ def arabic_sentiment_analysis(text):
+     if sentiment_tokenizer is None or sentiment_model is None:
+         return "neutral"  # Return neutral if models failed to load
+
+     pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
+     neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
+     pos_count = sum(1 for word in pos_words if word in text.lower())
+     neg_count = sum(1 for word in neg_words if word in text.lower())
+
+     if pos_count > neg_count:
+         return "positive"
+     elif neg_count > pos_count:
+         return "negative"
+     else:
+         try:
+             inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
+             outputs = sentiment_model(**inputs)
+             sentiment_class = torch.argmax(outputs.logits).item()
+             return ["negative", "neutral", "positive"][sentiment_class]
+         except Exception as e:
+             print(f"Error during Arabic sentiment analysis: {e}")
+             return "neutral"
+
+ def tts_interface(text_input, speaker_audio):
+     if tts is None:
+         raise gr.Error("TTS model failed to load. Check the logs.")
+     if speaker_audio is None:
+         raise gr.Error("Please upload a reference audio.")
+
+     language = detect_language_safely(text_input)
+     emotion = "neutral"
+     audio_output_path = "output.wav"
+
+     if sentiment_analyzer is not None:
+         try:
+             if language == "en":
+                 sentiment_result = sentiment_analyzer(text_input)[0]
+                 emotion = map_sentiment_to_emotion(sentiment_result["label"])
+             else:
+                 sentiment_result = arabic_sentiment_analysis(text_input)
+                 emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
+         except Exception as e:
+             print(f"Error during sentiment analysis: {e}")
+             pass
+
+     try:
+         tts.tts_to_file(
+             text=text_input,
+             file_path=audio_output_path,
+             emotion=emotion,
+             speaker_wav=speaker_audio,
+             language=language
+         )
+         return audio_output_path
+     except Exception as e:
+         raise gr.Error(f"Error during TTS: {e}")
+
+ iface = gr.Interface(
+     fn=tts_interface,
+     inputs=[
+         gr.Textbox(label="Enter Text"),
+         gr.Audio(sources=["upload"], type="filepath", label="Upload Reference Audio"),  # filepath so speaker_wav receives a path
+     ],
+     outputs=gr.Audio(label="Generated Audio", autoplay=True),
+     title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
+     description="Enter text and upload a reference audio to clone the voice. The XTTS v2 model will generate speech with an emotion inferred from the sentiment of the text (English and Arabic supported).",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ torch
+ TTS
+ transformers
+ langdetect
+ pydantic
+ accelerate  # Often needed by transformers
+ sentencepiece  # Often needed by multilingual models