Upload folder using huggingface_hub
- .DS_Store +0 -0
- .gitattributes +1 -0
- .gradio/certificate.pem +31 -0
- README.md +3 -9
- app/.DS_Store +0 -0
- app/.gradio/certificate.pem +31 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/config.py +10 -0
- app/main.py +80 -0
- app/main_legacy.py +187 -0
- app/scratch.py +17 -0
- app/utils/__init__.py +0 -0
- app/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- app/utils/__pycache__/chat_model.cpython-312.pyc +0 -0
- app/utils/__pycache__/med42.cpython-312.pyc +0 -0
- app/utils/__pycache__/speech_to_text.cpython-312.pyc +0 -0
- app/utils/__pycache__/text_to_speech.cpython-312.pyc +0 -0
- app/utils/chat_model.py +22 -0
- app/utils/med42.py +124 -0
- app/utils/speech_to_text.py +92 -0
- app/utils/text_to_speech.py +24 -0
- environment.yml +211 -0
- requirements.txt +10 -0
- test_output.wav +3 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test_output.wav filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: indigo
-colorTo: blue
+title: gencent
+app_file: app/main.py
 sdk: gradio
-sdk_version: 5.
-app_file: app.py
-pinned: false
+sdk_version: 5.9.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/.DS_Store
ADDED
Binary file (6.15 kB)
app/.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
app/__init__.py
ADDED
File without changes
app/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (146 Bytes)
app/config.py
ADDED
@@ -0,0 +1,10 @@
+import os
+from dotenv import load_dotenv
+import torch
+
+load_dotenv()
+
+class Config:
+    SECRET_KEY = os.getenv('SECRET_KEY', 'your-secret-key')
+    MODEL_PATH = os.getenv('MODEL_PATH', 'mistralai/Mistral-7B-v0.1')
+    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
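Config is a plain settings holder: the .env values are read once at import time and fall back to the defaults shown above. A minimal sketch of consuming it from elsewhere in the app (the .env contents here are hypothetical):

# Hypothetical .env next to the app:
#   SECRET_KEY=change-me
#   MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.1
from config import Config  # resolved when run from app/, like the other imports

print(Config.MODEL_PATH)  # falls back to 'mistralai/Mistral-7B-v0.1' when unset
print(Config.DEVICE)      # 'cuda' when a GPU is visible, otherwise 'cpu'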
app/main.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+import numpy as np
+from utils.speech_to_text import SpeechRecognizer
+from utils.text_to_speech import TextToSpeech
+
+class VoiceChatApp:
+    def __init__(self):
+        self.speech_recognizer = SpeechRecognizer()
+        self.tts_engine = TextToSpeech()
+        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Alex?"
+        self.chat_history = []
+
+    async def welcome_audio(self):
+        """Generate and play the welcome message."""
+        sample_rate, audio_data = await self.tts_engine.synthesize(self.welcome_message)
+        audio_response = (sample_rate, audio_data.astype(np.int16))
+        self.chat_history.append((None, self.welcome_message))
+        return self.chat_history, audio_response
+
+    async def process_audio(self, audio, history):
+        """Process user audio input and generate a response."""
+        if audio is None:
+            return history, (24000, np.zeros(24000, dtype=np.int16)), None
+
+        # Speech to text
+        text_input = await self.speech_recognizer.transcribe(audio)
+        if not text_input:
+            return history, (24000, np.zeros(24000, dtype=np.int16)), None
+
+        # Generate response
+        response = "This is a test response. Please confirm if you can hear this clearly."
+
+        # Text to speech
+        sample_rate, audio_data = await self.tts_engine.synthesize(response)
+        audio_response = (sample_rate, audio_data.astype(np.int16))
+
+        # Update chat history
+        history.append((text_input, response))
+
+        return history, audio_response, None
+
+    def launch(self):
+        """Launch the Gradio interface."""
+        with gr.Blocks(title="Voice-Enabled Chatbot") as interface:
+            with gr.Row():
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(label="Chat History", height=400)
+                    audio_input = gr.Audio(sources=["microphone"], type="numpy",
+                                           label="Speak Here", interactive=True)
+                    audio_output = gr.Audio(label="Assistant Response", autoplay=True, elem_classes="compact-audio")
+
+            # Initial welcome message
+            interface.load(
+                fn=self.welcome_audio,
+                outputs=[chatbot, audio_output]
+            )
+
+            # Audio processing chain
+            audio_input.change(
+                fn=self.process_audio,
+                inputs=[audio_input, chatbot],
+                outputs=[chatbot, audio_output, audio_input],
+                api_name="process_audio"
+            ).then(
+                lambda: None,
+                None,
+                audio_input,
+                queue=False
+            )
+
+            interface.launch(
+                server_name="127.0.0.1",
+                server_port=7860,
+                share=True,
+                debug=True
+            )
+
+if __name__ == "__main__":
+    app = VoiceChatApp()
+    app.launch()
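The change → then chain above is the key interaction pattern: process_audio consumes the recording, and the follow-up `.then(lambda: None, None, audio_input, queue=False)` clears the microphone component so the next recording fires `change` again. A stripped-down sketch of just that pattern (component names here are illustrative, not from the app):

import gradio as gr

def handle(audio):
    # Stand-in for process_audio: report whether anything was recorded
    return "got audio" if audio is not None else "no audio"

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy")
    status = gr.Textbox()

    # After handling a recording, reset the input so the next one
    # triggers .change() again instead of being ignored.
    mic.change(handle, mic, status).then(lambda: None, None, mic, queue=False)

demo.launch()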
app/main_legacy.py
ADDED
@@ -0,0 +1,187 @@
+import gradio as gr
+from utils.speech_to_text import SpeechRecognizer
+from utils.text_to_speech import TextToSpeech
+import numpy as np
+from utils.med42 import Med42
+import time
+
+class VoiceChatApp:
+    def __init__(self):
+        self.speech_recognizer = SpeechRecognizer()
+        self.tts_engine = TextToSpeech()
+        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Aleks?"
+        self.chat_history = []  # Maintain persistent chat history
+
+    async def welcome_audio(self):
+        """Generate and play the welcome message."""
+        tts_output = await self.tts_engine.synthesize(self.welcome_message)
+
+        # Extract audio data if the TTS returns a tuple (sample_rate, data)
+        if isinstance(tts_output, tuple) and len(tts_output) == 2:
+            _, audio_data = tts_output
+        else:
+            audio_data = tts_output
+
+        audio_response = self._normalize_audio(audio_data, 24000)
+        self.chat_history.append({"role": "assistant", "content": f"🤖 {self.welcome_message}"})
+        return self.chat_history, audio_response  # Return chat history & audio
+
+    # Handles TTS errors and emits debug prints while processing user audio
+    async def process_audio(self, audio, state):
+        """Process user audio, generate a response, and return updated chat history."""
+        if audio is None:
+            return state, (24000, np.zeros((24000,), dtype=np.int16)), None
+
+        # Convert speech to text
+        text_input = await self.speech_recognizer.transcribe(audio)
+        if not text_input:
+            print("Speech recognizer returned no text")
+            return state, (24000, np.zeros((24000,), dtype=np.int16)), None
+
+        # Generate response (simplified for debugging)
+        response = "This is a test response. Please confirm if you can hear this."
+
+        print(f"TTS Input Text: '{response}'")  # Debug print
+        tts_output = None  # Keep the name defined even if synthesis fails below
+        try:
+            # Attempt TTS synthesis
+            tts_output = await self.tts_engine.synthesize(response)
+            print(f"Raw TTS Output Type: {type(tts_output)}")  # Debug type
+
+            # Extract audio data
+            if isinstance(tts_output, tuple):
+                sample_rate, audio_data = tts_output
+
+                print(f"Raw TTS data type: {type(audio_data)}")  # Check container type
+                print(f"Raw TTS dtype: {audio_data.dtype}")  # Check numerical type
+                print(f"Raw TTS min/max: {np.min(audio_data)}, {np.max(audio_data)}")  # Verify range
+
+                print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
+            else:
+                audio_data = tts_output
+
+            # Check for zeros in audio data
+            if np.all(audio_data == 0):
+                print("Warning: TTS generated silent audio!")
+        except Exception as e:
+            print(f"TTS Synthesis Error: {e}")
+            audio_data = np.zeros((24000,), dtype=np.float32)
+
+        # Capture both sample rate and audio data
+        if isinstance(tts_output, tuple) and len(tts_output) == 2:
+            sample_rate, audio_data = tts_output
+        else:
+            sample_rate = 24000  # Fallback
+            audio_data = tts_output if tts_output is not None else audio_data
+
+        # Normalize while preserving sample rate
+        audio_response = self._normalize_audio(audio_data, sample_rate)
+
+        # Update chat history and return
+        messages = [
+            {"role": "user", "content": f"🎤 User said: {text_input}"},
+            {"role": "assistant", "content": f"🤖 {response}"}
+        ]
+        state = state.copy() if state else []
+        state.extend(messages)
+        print(f"Final audio response - SR: {audio_response[0]}, Shape: {audio_response[1].shape}, Dtype: {audio_response[1].dtype}")
+        print(f"Final audio peaks: {np.max(np.abs(audio_response[1]))}")  # Should be > 0
+        audio_response = (audio_response[0], audio_response[1], str(time.time()))
+
+        return state, audio_response, None
+
+    def _normalize_audio(self, audio_array, sample_rate):
+        """Final format adjustment for Gradio compatibility."""
+        # Keep previous processing steps
+        if audio_array.dtype != np.int16:
+            audio_array = audio_array.astype(np.int16)
+
+        # Ensure 2D shape for mono audio (samples, 1)
+        if audio_array.ndim == 1:
+            audio_array = audio_array.reshape(-1, 1)
+
+        # Convert to 1D if mono
+        if audio_array.ndim == 2 and audio_array.shape[1] == 1:
+            audio_array = audio_array.flatten()  # Shape becomes (N,)
+
+        return (sample_rate, audio_array)
+
+    # The rest of the code (launch method, etc.) remains unchanged
+
+    def launch(self):
+        """Launch the Gradio interface with an audio refresh workaround."""
+        with gr.Blocks(title="Voice-Enabled Chatbot", css=".autoplay-audio { display: none }") as interface:
+            with gr.Row():
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(
+                        label="Chat History",
+                        type="messages",
+                        height=400
+                    )
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        label="Speak Here",
+                        interactive=True  # Ensure it stays interactive
+                    )
+                    audio_output = gr.Audio(
+                        label="Assistant Response",
+                        autoplay=True,
+                        format="wav",  # Explicit format
+                        elem_id="audio-output"  # Add ID for JS control
+                    )
+
+            # Add JavaScript to force audio reload
+            interface.load(
+                None,
+                None,
+                None,
+                _js="""
+                () => {
+                    function reloadAudio() {
+                        const audio = document.querySelector('#audio-output audio');
+                        if (audio) {
+                            const source = audio.querySelector('source');
+                            if (source) {
+                                source.src += '#' + Date.now();
+                                audio.load();
+                            }
+                        }
+                    }
+                    setInterval(reloadAudio, 500);
+                }
+                """
+            )
+
+            # State for managing chat history
+            state = gr.State([])
+
+            # On page load, play welcome message and show initial chat history
+            interface.load(
+                fn=self.welcome_audio,
+                outputs=[chatbot, audio_output]
+            )
+
+            # When user speaks, process audio and update the chat
+            audio_input.change(
+                fn=self.process_audio,
+                inputs=[audio_input, state],
+                outputs=[chatbot, audio_output, audio_input],
+                api_name="process_audio"
+            ).then(
+                lambda: None,
+                None,
+                audio_input,
+                queue=False
+            )
+
+            interface.launch(
+                server_name="127.0.0.1",
+                server_port=7860,
+                share=True,
+                debug=True
+            )
+
+if __name__ == "__main__":
+    app = VoiceChatApp()
+    app.launch()
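One caveat in _normalize_audio: astype(np.int16) is a straight cast, so float TTS output in the usual [-1.0, 1.0] range would collapse to zeros (the edge-tts path is safe because pydub already yields int16 samples). A hedged sketch of the scaling that would be needed if float input ever reached it:

import numpy as np

def float_to_int16(audio_float: np.ndarray) -> np.ndarray:
    # Scale [-1.0, 1.0] float audio into the int16 range before casting;
    # a bare astype(np.int16) truncates such values to 0.
    clipped = np.clip(audio_float, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)

print(float_to_int16(np.array([0.0, 0.5, -1.0])))  # [     0  16383 -32767]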
app/scratch.py
ADDED
@@ -0,0 +1,17 @@
+import asyncio
+from utils.text_to_speech import TextToSpeech  # Update with your actual module path
+import soundfile as sf
+
+async def main():
+    tts = TextToSpeech()
+    text = "This is a test voice output"
+
+    # Get audio data
+    sample_rate, audio_data = await tts.synthesize(text)
+
+    # Save to file (WAV format)
+    sf.write("test_output.wav", audio_data, sample_rate)
+    print("Saved test_output.wav")
+
+if __name__ == "__main__":
+    asyncio.run(main())
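Since sounddevice is already pinned in environment.yml, a natural follow-up to this script is to audition the saved file straight from Python (a hypothetical extension, not part of the commit):

import sounddevice as sd
import soundfile as sf

data, sample_rate = sf.read("test_output.wav")
sd.play(data, sample_rate)  # non-blocking playback
sd.wait()                   # block until the clip finishes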
app/utils/__init__.py
ADDED
File without changes
app/utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (152 Bytes)
app/utils/__pycache__/chat_model.cpython-312.pyc
ADDED
Binary file (1.61 kB)
app/utils/__pycache__/med42.cpython-312.pyc
ADDED
Binary file (5.42 kB)
app/utils/__pycache__/speech_to_text.cpython-312.pyc
ADDED
Binary file (5.38 kB)
app/utils/__pycache__/text_to_speech.cpython-312.pyc
ADDED
Binary file (1.58 kB)
app/utils/chat_model.py
ADDED
@@ -0,0 +1,22 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+class ChatModel:
+    def __init__(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", token=True)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "mistralai/Mistral-7B-Instruct-v0.1",  # same checkpoint as the tokenizer (was the base model, a mismatch)
+            torch_dtype=torch.float16,
+            token=True
+        )
+
+    async def generate_response(self, input_text):
+        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_length=100,
+            num_return_sequences=1,
+            do_sample=True, temperature=0.7  # sampling must be enabled for temperature to apply
+        )
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return response
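ChatModel is not wired into either entry point yet; a minimal sketch of driving it from an async context (assumes a logged-in Hugging Face token and enough GPU memory for the 7B weights in fp16):

import asyncio
from utils.chat_model import ChatModel  # run from the app/ directory, like main.py

async def demo():
    chat = ChatModel()  # downloads ~14 GB of weights on first use
    reply = await chat.generate_response("Summarise why follow-up calls matter.")
    print(reply)

asyncio.run(demo())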
app/utils/med42.py
ADDED
@@ -0,0 +1,124 @@
+# Import relevant packages
+import requests
+import yaml
+from typing import List, Union
+from pathlib import Path
+
+
+class Med42():
+    """
+    A class for interacting with the Med42 API
+    """
+    def __init__(self, base_endpoint: str = "https://dev-openai-api.med42.ai/",
+                 prompt_filepath: Union[str, Path, None] = "./prompts.yaml"):
+        """
+        Initialise the Med42 API caller object
+
+        Args:
+            base_endpoint: URL where the API is hosted. Defaults to "https://dev-openai-api.med42.ai/".
+            prompt_filepath: Filepath to prompts catalogue in yaml format. Defaults to "./prompts.yaml".
+        """
+        self.base_endpoint = base_endpoint
+        self.docs_endpoint = self.base_endpoint + "docs"
+        # self.available_models = self._list_model_ids()
+        self.prompt_catalogue = load_yaml(prompt_filepath) if prompt_filepath is not None else None
+
+
+    def __repr__(self):
+        """
+        Returns: Description of this object class including a link to the API docs page
+        """
+        return f"Med42 API calls available, see docs at {self.docs_endpoint}"
+
+
+    def list_models(self) -> List[dict]:
+        """
+        Pull information on all the models currently hosted on the API
+
+        Returns:
+            List of models available and their details
+        """
+        try:
+            response = requests.get(self.base_endpoint + "v1/models")
+            response.raise_for_status()  # raise an exception for HTTP errors
+            return response.json()['data']
+
+        except requests.RequestException as e:
+            print("Error: ", e)
+
+
+    def _list_model_ids(self) -> List[str]:
+        """
+        Returns: List of currently available model ids
+        """
+        return [model_entry['id'] for model_entry in self.list_models()]
+
+
+    def _update_prompt_catalogue(self, prompt_filepath: Union[str, Path] = "./prompts.yaml") -> dict:
+        """
+        Read in a catalogue of prompts (recorded in a yaml file) and update the attribute
+
+        Args:
+            prompt_filepath: filepath to the catalogue of prompts to be read. Defaults to "./prompts.yaml"
+        """
+        self.prompt_catalogue = load_yaml(prompt_filepath)
+        print(f"Prompt catalogue updated to be the prompts from {prompt_filepath}")
+
+
+    def chat_completion(self, system_instruct: str, user_instruct: str, llm_model: str = "Llama3-Med42-70B-32k",
+                        temp: float = 0.7, generate_log: bool = False) -> str:
+        """
+        Uses the chat completion functionality
+
+        Args:
+            system_instruct: System instruction
+            user_instruct: User instruction
+            llm_model: Language model to use. Defaults to "Llama3-Med42-70B-32k"
+            temp: Temperature parameter for generating responses. Defaults to 0.7
+            generate_log: Whether to log the response or not. Defaults to False
+
+        Returns:
+            LLM output if generate_log is False, otherwise a tuple of a log dictionary (system instruction, user instruction, LLM call response) and the LLM output
+        """
+        return "hardcoded response because Med42 is not working I don't think. Just making it longer"  # early return; the code below is unreachable while the API is down
+        data = {"model": llm_model, "messages": [{"role": "system", "content": system_instruct},
+                                                 {"role": "user", "content": f"{user_instruct}"}], "temperature": temp}
+
+        headers = {"Content-Type": "application/json"}
+
+        try:
+            response = requests.post(self.base_endpoint + "v1/chat/completions", json=data, headers=headers)
+            response.raise_for_status()
+            if not generate_log:
+                return response.json()["choices"][0]["message"]["content"]  # LLM chat completion output
+            else:
+                # LLM logged response plus chat completion output (tuple outputted)
+                return {"system_instruction": system_instruct, "user_instruction": user_instruct, "llm_call_response": response.json()}, response.json()["choices"][0]["message"]["content"]
+        except requests.RequestException as e:
+            print("Error: ", e)
+
+
+# Util functions
+def load_yaml(yaml_filepath: str | Path) -> dict:
+    """
+    Read in YAML files
+
+    Args:
+        yaml_filepath: filepath to the yaml file to be read
+
+    Returns:
+        Contents of the YAML file
+    """
+    try:
+        with open(yaml_filepath, 'r') as file:
+            data = yaml.safe_load(file)
+            return data
+    except FileNotFoundError:
+        raise FileNotFoundError(f"File not found: {yaml_filepath}")
+    except IOError as e:
+        raise IOError(f"Error reading file: {e}")
+    except yaml.YAMLError as e:
+        raise ValueError(f"Error parsing YAML file: {e}")
+
+if __name__ == "__main__":
+    llm_caller = Med42()
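A minimal sketch of the intended call path once the API is reachable again (note that chat_completion currently short-circuits to the hardcoded string above; prompts.yaml is skipped here so the example does not depend on it):

from utils.med42 import Med42

llm = Med42(prompt_filepath=None)  # avoid requiring a local prompts.yaml
print(llm)  # points at https://dev-openai-api.med42.ai/docs

reply = llm.chat_completion(
    system_instruct="You are a concise clinical assistant.",
    user_instruct="List red-flag symptoms that warrant an urgent follow-up.",
)
print(reply)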
app/utils/speech_to_text.py
ADDED
@@ -0,0 +1,92 @@
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import numpy as np
+import torch
+import librosa
+
+class ASRConfig:
+    """Configuration class for ASR transcription."""
+    def __init__(
+        self,
+        model_id="openai/whisper-large-v2",
+        language="english",
+        sampling_rate=16000,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    ):
+        self.model_id = model_id
+        self.language = language
+        self.sampling_rate = sampling_rate
+        self.device = device
+        self.torch_dtype = torch_dtype
+
+class SpeechRecognizer:
+    def __init__(self, config: ASRConfig = None):
+        self.config = config if config else ASRConfig()
+        print(f"Using ASR configuration: {self.config.__dict__}")
+        self._setup_model()
+
+    def _setup_model(self):
+        """Initialize the Whisper model and processor."""
+        try:
+            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                self.config.model_id,
+                torch_dtype=self.config.torch_dtype,
+                use_safetensors=True,
+            ).to(self.config.device)
+
+            self.processor = AutoProcessor.from_pretrained(self.config.model_id)
+            self.pipe = pipeline(
+                "automatic-speech-recognition",
+                model=self.model,
+                tokenizer=self.processor.tokenizer,
+                feature_extractor=self.processor.feature_extractor,
+                torch_dtype=self.config.torch_dtype,
+                device=self.config.device,
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to set up Whisper model: {str(e)}")
+
+    async def transcribe(self, audio: tuple, prompt: str = None) -> str:
+        """
+        Transcribes the provided audio using the Whisper pipeline.
+
+        Args:
+            audio (tuple): A tuple containing (sample_rate, audio_array).
+            prompt (str): An optional text prompt to guide transcription.
+
+        Returns:
+            str: Transcription of the audio.
+        """
+        if not audio or len(audio) != 2:
+            raise ValueError("Invalid audio input. Expected a tuple (sample_rate, audio_array).")
+
+        try:
+            # Extract the raw audio data (audio_array) from the input tuple
+            sample_rate, audio_array = audio
+
+            # Ensure the audio is a numpy array and has the expected format
+            if not isinstance(audio_array, np.ndarray):
+                raise TypeError(f"Expected numpy.ndarray for audio data, got {type(audio_array)}")
+
+            # Ensure the audio array is in floating-point format
+            if audio_array.dtype != np.float32:
+                audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
+
+            # Resample audio if the sample rate differs from the configured rate
+            if sample_rate != self.config.sampling_rate:
+                audio_array = librosa.resample(audio_array, orig_sr=sample_rate,
+                                               target_sr=self.config.sampling_rate)
+
+            # Prepare generate_kwargs for the pipeline
+            generate_kwargs = {}
+            if self.config.language:
+                generate_kwargs["language"] = self.config.language
+            if prompt:
+                prompt_ids = self.processor.get_prompt_ids(prompt, return_tensors="pt").to(self.config.device)
+                generate_kwargs["prompt_ids"] = prompt_ids
+
+            # Run transcription through the pipeline
+            result = self.pipe(audio_array, generate_kwargs=generate_kwargs)
+            return result["text"].strip()
+        except Exception as e:
+            raise RuntimeError(f"Transcription failed: {str(e)}")
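transcribe expects exactly the (sample_rate, int16 array) tuple that Gradio's numpy audio component emits, so a plumbing check does not need a microphone. A hedged smoke test (Whisper returns an empty or arbitrary string for pure silence; the point is only that dtype conversion and resampling run):

import asyncio
import numpy as np
from utils.speech_to_text import SpeechRecognizer

async def smoke_test():
    recognizer = SpeechRecognizer()  # fetches whisper-large-v2 on first run
    silence = np.zeros(48000, dtype=np.int16)  # one second at 48 kHz, forces the resample branch
    text = await recognizer.transcribe((48000, silence))
    print(repr(text))

asyncio.run(smoke_test())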
app/utils/text_to_speech.py
ADDED
@@ -0,0 +1,24 @@
+import edge_tts
+import io
+import numpy as np
+from pydub import AudioSegment  # Install with: pip install pydub
+
+class TextToSpeech:
+    def __init__(self, voice="en-US-AriaNeural"):
+        self.voice = voice
+
+    async def synthesize(self, text):
+        communicate = edge_tts.Communicate(text, self.voice)
+
+        # Collect raw audio bytes
+        audio_bytes = bytearray()
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                audio_bytes.extend(chunk["data"])
+
+        # Convert to numpy array using pydub
+        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
+        samples = np.array(audio.get_array_of_samples())
+        sample_rate = audio.frame_rate
+
+        return (sample_rate, samples)
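pydub decodes the MP3 stream via ffmpeg, so ffmpeg must be on PATH. For trying voices other than en-US-AriaNeural, edge_tts ships an async voice listing; a small sketch (assumes network access to the Edge TTS service):

import asyncio
import edge_tts

async def english_voices():
    for voice in await edge_tts.list_voices():
        if voice["Locale"].startswith("en-"):
            print(voice["ShortName"])  # e.g. en-US-AriaNeural, the default above

asyncio.run(english_voices())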
environment.yml
ADDED
@@ -0,0 +1,211 @@
+name: gencent_env
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - aiohappyeyeballs=2.4.4=py312hca03da5_0
+  - aiohttp=3.11.10=py312h80987f9_0
+  - aiosignal=1.2.0=pyhd3eb1b0_0
+  - altair=5.0.1=py312hca03da5_0
+  - arrow-cpp=16.1.0=hbc20fb2_0
+  - attrs=24.2.0=py312hca03da5_0
+  - aws-c-auth=0.6.19=h80987f9_0
+  - aws-c-cal=0.5.20=h80987f9_0
+  - aws-c-common=0.8.5=h80987f9_0
+  - aws-c-compression=0.2.16=h80987f9_0
+  - aws-c-event-stream=0.2.15=h313beb8_0
+  - aws-c-http=0.6.25=h80987f9_0
+  - aws-c-io=0.13.10=h80987f9_0
+  - aws-c-mqtt=0.7.13=h80987f9_0
+  - aws-c-s3=0.1.51=h80987f9_0
+  - aws-c-sdkutils=0.1.6=h80987f9_0
+  - aws-checksums=0.1.13=h80987f9_0
+  - aws-crt-cpp=0.18.16=h313beb8_0
+  - aws-sdk-cpp=1.10.55=h313beb8_0
+  - blas=1.0=openblas
+  - blinker=1.6.2=py312hca03da5_0
+  - boost-cpp=1.82.0=h48ca7d4_2
+  - bottleneck=1.4.2=py312ha86b861_0
+  - brotli-python=1.0.9=py312h313beb8_8
+  - bzip2=1.0.8=h80987f9_6
+  - c-ares=1.19.1=h80987f9_0
+  - ca-certificates=2024.11.26=hca03da5_0
+  - cachetools=5.3.3=py312hca03da5_0
+  - certifi=2024.12.14=py312hca03da5_0
+  - cffi=1.17.1=py312h3eb5a62_0
+  - charset-normalizer=3.3.2=pyhd3eb1b0_0
+  - click=8.1.7=py312hca03da5_0
+  - datasets=3.2.0=pyhd8ed1ab_0
+  - dill=0.3.8=py312hca03da5_0
+  - filelock=3.13.1=py312hca03da5_0
+  - freetype=2.12.1=h1192e45_0
+  - frozenlist=1.5.0=py312h80987f9_0
+  - fsspec=2024.6.1=py312hca03da5_0
+  - gettext=0.22.5=h8414b35_3
+  - gettext-tools=0.22.5=h8414b35_3
+  - gflags=2.2.2=h313beb8_1
+  - gitdb=4.0.7=pyhd3eb1b0_0
+  - gitpython=3.1.43=py312hca03da5_0
+  - glog=0.5.0=h313beb8_1
+  - huggingface_hub=0.26.5=pyhd8ed1ab_1
+  - icu=73.1=h313beb8_0
+  - idna=3.7=py312hca03da5_0
+  - jinja2=3.1.4=py312hca03da5_1
+  - jpeg=9e=h80987f9_3
+  - jsonschema=4.23.0=py312hca03da5_0
+  - jsonschema-specifications=2023.7.1=py312hca03da5_0
+  - krb5=1.20.1=hf3e1bf2_1
+  - lame=3.100=h1a28f6b_0
+  - lcms2=2.12=hba8e193_0
+  - lerc=3.0=hc377ac9_0
+  - libabseil=20240116.2=cxx17_h313beb8_0
+  - libasprintf=0.22.5=h8414b35_3
+  - libasprintf-devel=0.22.5=h8414b35_3
+  - libboost=1.82.0=h0bc93f9_2
+  - libbrotlicommon=1.0.9=h80987f9_8
+  - libbrotlidec=1.0.9=h80987f9_8
+  - libbrotlienc=1.0.9=h80987f9_8
+  - libcurl=8.9.1=h3e2b118_0
+  - libcxx=19.1.6=ha82da77_1
+  - libdeflate=1.17=h80987f9_1
+  - libedit=3.1.20230828=h80987f9_0
+  - libev=4.33=h1a28f6b_1
+  - libevent=2.1.12=h02f6b3c_1
+  - libexpat=2.6.3=hf9b8971_0
+  - libffi=3.4.4=hca03da5_1
+  - libflac=1.4.3=hb765f3a_0
+  - libgettextpo=0.22.5=h8414b35_3
+  - libgettextpo-devel=0.22.5=h8414b35_3
+  - libgfortran=5.0.0=11_3_0_hca03da5_28
+  - libgfortran5=11.3.0=h009349e_28
+  - libgrpc=1.62.2=h62f6fdd_0
+  - libiconv=1.17=h0d3ecfb_2
+  - libintl=0.22.5=h8414b35_3
+  - libintl-devel=0.22.5=h8414b35_3
+  - libnghttp2=1.57.0=h62f6fdd_0
+  - libogg=1.3.5=h1a28f6b_1
+  - libopenblas=0.3.21=h269037a_0
+  - libopus=1.3.1=h80987f9_1
+  - libpng=1.6.39=h80987f9_0
+  - libprotobuf=4.25.3=h514c7bf_0
+  - libsndfile=1.2.2=h9739721_1
+  - libsqlite=3.46.0=hfb93653_0
+  - libssh2=1.11.0=h3e2b118_0
+  - libthrift=0.15.0=h73c2103_2
+  - libtiff=4.5.1=h313beb8_0
+  - libvorbis=1.3.7=h1a28f6b_0
+  - libwebp-base=1.3.2=h80987f9_1
+  - libzlib=1.2.13=hfb2fe0b_6
+  - llvm-openmp=14.0.6=hc6e5704_0
+  - lz4-c=1.9.4=h313beb8_1
+  - markdown-it-py=2.2.0=py312hca03da5_1
+  - markupsafe=2.1.3=py312h80987f9_0
+  - mdurl=0.1.0=py312hca03da5_0
+  - mpg123=1.32.9=hf642e45_0
+  - mpmath=1.3.0=py312hca03da5_0
+  - multidict=6.1.0=py312h80987f9_0
+  - multiprocess=0.70.15=py312hca03da5_0
+  - ncurses=6.4=h313beb8_0
+  - networkx=3.3=py312hca03da5_0
+  - numexpr=2.10.1=py312h5d9532f_0
+  - numpy=1.26.4=py312h7f4fdc5_0
+  - numpy-base=1.26.4=py312he047099_0
+  - openjpeg=2.5.2=h54b8e55_0
+  - openssl=3.4.0=h39f12f2_0
+  - orc=2.0.1=h937ddfc_0
+  - packaging=24.1=py312hca03da5_0
+  - pandas=2.2.2=py312hd77ebd4_0
+  - pillow=10.4.0=py312h80987f9_0
+  - pip=24.2=py312hca03da5_0
+  - portaudio=19.7.0=h5833ebf_0
+  - propcache=0.2.0=py312h80987f9_0
+  - protobuf=4.25.3=py312h8472c4a_0
+  - pyarrow=16.1.0=py312hd77ebd4_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pydeck=0.8.0=py312hca03da5_2
+  - pygments=2.15.1=py312hca03da5_1
+  - pysocks=1.7.1=py312hca03da5_0
+  - pysoundfile=0.12.1=pyhd8ed1ab_3
+  - python=3.12.2=hdf0ec26_0_cpython
+  - python-dateutil=2.9.0post0=py312hca03da5_2
+  - python-dotenv=1.0.1=pyhd8ed1ab_1
+  - python-sounddevice=0.5.0=pyhd8ed1ab_0
+  - python-tzdata=2023.3=pyhd3eb1b0_0
+  - python-xxhash=2.0.2=py312h80987f9_1
+  - python_abi=3.12=5_cp312
+  - pytorch=2.5.1=py3.12_0
+  - pytz=2024.1=py312hca03da5_0
+  - pyyaml=6.0.2=py312h80987f9_0
+  - re2=2022.04.01=hc377ac9_0
+  - readline=8.2=h1a28f6b_0
+  - referencing=0.30.2=py312hca03da5_0
+  - regex=2024.9.11=py312h80987f9_0
+  - requests=2.32.3=py312hca03da5_1
+  - rich=13.7.1=py312hca03da5_0
+  - rpds-py=0.10.6=py312h2aea54e_1
+  - safetensors=0.4.5=py312h7805bc0_1
+  - setuptools=75.1.0=py312hca03da5_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - smmap=4.0.0=pyhd3eb1b0_0
+  - snappy=1.2.1=h313beb8_0
+  - sqlite=3.45.3=h80987f9_0
+  - streamlit=1.38.0=py312hca03da5_0
+  - tenacity=8.2.3=py312hca03da5_0
+  - tk=8.6.14=h6ba3021_0
+  - tokenizers=0.21.0=py312hf3e4074_0
+  - toml=0.10.2=pyhd3eb1b0_0
+  - toolz=0.12.0=py312hca03da5_0
+  - tornado=6.4.1=py312h80987f9_0
+  - tqdm=4.66.5=py312h989b03a_0
+  - transformers=4.47.1=pyhd8ed1ab_0
+  - tzdata=2024b=h04d1e81_0
+  - urllib3=2.2.3=py312hca03da5_0
+  - utf8proc=2.6.1=h80987f9_1
+  - wheel=0.44.0=py312hca03da5_0
+  - xxhash=0.8.0=h1a28f6b_3
+  - xz=5.4.6=h80987f9_1
+  - yaml=0.2.5=h1a28f6b_0
+  - yarl=1.18.0=py312h80987f9_0
+  - zlib=1.2.13=hfb2fe0b_6
+  - zstd=1.5.6=hfb09047_0
+  - pip:
+    - aiofiles==23.2.1
+    - annotated-types==0.7.0
+    - anyio==4.7.0
+    - edge-tts==7.0.0
+    - fastapi==0.115.6
+    - ffmpy==0.5.0
+    - gradio==5.9.1
+    - gradio-client==1.5.2
+    - h11==0.14.0
+    - httpcore==1.0.7
+    - httpx==0.28.1
+    - hyperpyyaml==1.2.2
+    - joblib==1.4.2
+    - orjson==3.10.13
+    - pydantic==2.10.4
+    - pydantic-core==2.27.2
+    - pydub==0.25.1
+    - python-multipart==0.0.20
+    - ruamel-yaml==0.18.7
+    - ruamel-yaml-clib==0.2.12
+    - ruff==0.8.4
+    - safehttpx==0.1.6
+    - scipy==1.14.1
+    - semantic-version==2.10.0
+    - sentencepiece==0.2.0
+    - shellingham==1.5.4
+    - sniffio==1.3.1
+    - speechbrain==1.0.2
+    - srt==3.5.3
+    - starlette==0.41.3
+    - sympy==1.13.1
+    - tabulate==0.9.0
+    - tomlkit==0.13.2
+    - torchaudio==2.5.1
+    - typer==0.15.1
+    - typing-extensions==4.12.2
+    - uvicorn==0.34.0
+    - websockets==14.1
+prefix: /Users/hamza/miniconda3/envs/gencent_env
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+# Not sure version numbers here are actually correct
+gradio==3.50.2
+speechbrain==1.0.2
+transformers==4.21.0
+python-dotenv==0.19.0
+sounddevice==0.4.3
+soundfile==0.10.3.post1
+edge-tts==4.0.0
+# flask==2.0.1
+torch==2.3.1
test_output.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1260f6a6c0583fec3824fc8851c44c1adf2ac7b630fa09d9cb38dc65b5286c
+size 130220
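This is a Git LFS pointer, not audio: the real 130 kB wav lives in LFS storage and is fetched on checkout. A quick Python check that a checked-out file matches the pointer's oid (which is the sha256 of the file contents):

import hashlib

with open("test_output.wav", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

# Prints True for the real file; a bare pointer file will not match
print(digest == "8f1260f6a6c0583fec3824fc8851c44c1adf2ac7b630fa09d9cb38dc65b5286c")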