Upload folder using huggingface_hub
- .DS_Store +0 -0
- .gitattributes +1 -0
- .gradio/certificate.pem +31 -0
- README.md +3 -9
- app/.DS_Store +0 -0
- app/.gradio/certificate.pem +31 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/config.py +10 -0
- app/main.py +80 -0
- app/main_legacy.py +187 -0
- app/scratch.py +17 -0
- app/utils/__init__.py +0 -0
- app/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- app/utils/__pycache__/chat_model.cpython-312.pyc +0 -0
- app/utils/__pycache__/med42.cpython-312.pyc +0 -0
- app/utils/__pycache__/speech_to_text.cpython-312.pyc +0 -0
- app/utils/__pycache__/text_to_speech.cpython-312.pyc +0 -0
- app/utils/chat_model.py +22 -0
- app/utils/med42.py +124 -0
- app/utils/speech_to_text.py +92 -0
- app/utils/text_to_speech.py +24 -0
- environment.yml +211 -0
- requirements.txt +10 -0
- test_output.wav +3 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test_output.wav filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: indigo
-colorTo: blue
+title: gencent
+app_file: app/main.py
 sdk: gradio
-sdk_version: 5.
-app_file: app.py
-pinned: false
+sdk_version: 5.9.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/.DS_Store
ADDED
Binary file (6.15 kB)
app/.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
app/__init__.py
ADDED
File without changes
app/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (146 Bytes)
app/config.py
ADDED
@@ -0,0 +1,10 @@
+import os
+from dotenv import load_dotenv
+import torch
+
+load_dotenv()
+
+class Config:
+    SECRET_KEY = os.getenv('SECRET_KEY', 'your-secret-key')
+    MODEL_PATH = os.getenv('MODEL_PATH', 'mistralai/Mistral-7B-v0.1')
+    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
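Config is a plain settings holder: the .env values are read once at import time and fall back to the defaults shown above. A minimal sketch of consuming it from elsewhere in the app (the .env contents here are hypothetical):

# Hypothetical .env next to the app:
#   SECRET_KEY=change-me
#   MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.1
from config import Config  # resolved when run from app/, like the other imports

print(Config.MODEL_PATH)  # falls back to 'mistralai/Mistral-7B-v0.1' when unset
print(Config.DEVICE)      # 'cuda' when a GPU is visible, otherwise 'cpu'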
app/main.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+import numpy as np
+from utils.speech_to_text import SpeechRecognizer
+from utils.text_to_speech import TextToSpeech
+
+class VoiceChatApp:
+    def __init__(self):
+        self.speech_recognizer = SpeechRecognizer()
+        self.tts_engine = TextToSpeech()
+        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Alex?"
+        self.chat_history = []
+
+    async def welcome_audio(self):
+        """Generate and play the welcome message."""
+        sample_rate, audio_data = await self.tts_engine.synthesize(self.welcome_message)
+        audio_response = (sample_rate, audio_data.astype(np.int16))
+        self.chat_history.append((None, self.welcome_message))
+        return self.chat_history, audio_response
+
+    async def process_audio(self, audio, history):
+        """Process user audio input and generate a response."""
+        if audio is None:
+            return history, (24000, np.zeros(24000, dtype=np.int16)), None
+
+        # Speech to text
+        text_input = await self.speech_recognizer.transcribe(audio)
+        if not text_input:
+            return history, (24000, np.zeros(24000, dtype=np.int16)), None
+
+        # Generate response
+        response = "This is a test response. Please confirm if you can hear this clearly."
+
+        # Text to speech
+        sample_rate, audio_data = await self.tts_engine.synthesize(response)
+        audio_response = (sample_rate, audio_data.astype(np.int16))
+
+        # Update chat history
+        history.append((text_input, response))
+
+        return history, audio_response, None
+
+    def launch(self):
+        """Launch the Gradio interface."""
+        with gr.Blocks(title="Voice-Enabled Chatbot") as interface:
+            with gr.Row():
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(label="Chat History", height=400)
+                    audio_input = gr.Audio(sources=["microphone"], type="numpy",
+                                           label="Speak Here", interactive=True)
+                    audio_output = gr.Audio(label="Assistant Response", autoplay=True, elem_classes="compact-audio")
+
+            # Initial welcome message
+            interface.load(
+                fn=self.welcome_audio,
+                outputs=[chatbot, audio_output]
+            )
+
+            # Audio processing chain
+            audio_input.change(
+                fn=self.process_audio,
+                inputs=[audio_input, chatbot],
+                outputs=[chatbot, audio_output, audio_input],
+                api_name="process_audio"
+            ).then(
+                lambda: None,
+                None,
+                audio_input,
+                queue=False
+            )
+
+            interface.launch(
+                server_name="127.0.0.1",
+                server_port=7860,
+                share=True,
+                debug=True
+            )
+
+if __name__ == "__main__":
+    app = VoiceChatApp()
+    app.launch()
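The change → then chain above is the key interaction pattern: process_audio consumes the recording, and the follow-up `.then(lambda: None, None, audio_input, queue=False)` clears the microphone component so the next recording fires `change` again. A stripped-down sketch of just that pattern (component names here are illustrative, not from the app):

import gradio as gr

def handle(audio):
    # Stand-in for process_audio: report whether anything was recorded
    return "got audio" if audio is not None else "no audio"

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy")
    status = gr.Textbox()

    # After handling a recording, reset the input so the next one
    # triggers .change() again instead of being ignored.
    mic.change(handle, mic, status).then(lambda: None, None, mic, queue=False)

demo.launch()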
app/main_legacy.py
ADDED
@@ -0,0 +1,187 @@
+import gradio as gr
+from utils.speech_to_text import SpeechRecognizer
+from utils.text_to_speech import TextToSpeech
+import numpy as np
+from utils.med42 import Med42
+import time
+
+class VoiceChatApp:
+    def __init__(self):
+        self.speech_recognizer = SpeechRecognizer()
+        self.tts_engine = TextToSpeech()
+        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Aleks?"
+        self.chat_history = []  # Maintain persistent chat history
+
+    async def welcome_audio(self):
+        """Generate and play the welcome message."""
+        tts_output = await self.tts_engine.synthesize(self.welcome_message)
+
+        # Extract audio data if the TTS returns a tuple (sample_rate, data)
+        if isinstance(tts_output, tuple) and len(tts_output) == 2:
+            _, audio_data = tts_output
+        else:
+            audio_data = tts_output
+
+        audio_response = self._normalize_audio(audio_data, 24000)
+        self.chat_history.append({"role": "assistant", "content": f"🤖 {self.welcome_message}"})
+        return self.chat_history, audio_response  # Return chat history & audio
+
+    # Handles TTS errors and emits debug prints while processing user audio
+    async def process_audio(self, audio, state):
+        """Process user audio, generate a response, and return updated chat history."""
+        if audio is None:
+            return state, (24000, np.zeros((24000,), dtype=np.int16)), None
+
+        # Convert speech to text
+        text_input = await self.speech_recognizer.transcribe(audio)
+        if not text_input:
+            print("Speech recognizer returned no text")
+            return state, (24000, np.zeros((24000,), dtype=np.int16)), None
+
+        # Generate response (simplified for debugging)
+        response = "This is a test response. Please confirm if you can hear this."
+
+        print(f"TTS Input Text: '{response}'")  # Debug print
+        tts_output = None  # Keep the name defined even if synthesis fails below
+        try:
+            # Attempt TTS synthesis
+            tts_output = await self.tts_engine.synthesize(response)
+            print(f"Raw TTS Output Type: {type(tts_output)}")  # Debug type
+
+            # Extract audio data
+            if isinstance(tts_output, tuple):
+                sample_rate, audio_data = tts_output
+
+                print(f"Raw TTS data type: {type(audio_data)}")  # Check container type
+                print(f"Raw TTS dtype: {audio_data.dtype}")  # Check numerical type
+                print(f"Raw TTS min/max: {np.min(audio_data)}, {np.max(audio_data)}")  # Verify range
+
+                print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
+            else:
+                audio_data = tts_output
+
+            # Check for zeros in audio data
+            if np.all(audio_data == 0):
+                print("Warning: TTS generated silent audio!")
+        except Exception as e:
+            print(f"TTS Synthesis Error: {e}")
+            audio_data = np.zeros((24000,), dtype=np.float32)
+
+        # Capture both sample rate and audio data
+        if isinstance(tts_output, tuple) and len(tts_output) == 2:
+            sample_rate, audio_data = tts_output
+        else:
+            sample_rate = 24000  # Fallback
+            audio_data = tts_output if tts_output is not None else audio_data
+
+        # Normalize while preserving sample rate
+        audio_response = self._normalize_audio(audio_data, sample_rate)
+
+        # Update chat history and return
+        messages = [
+            {"role": "user", "content": f"🎤 User said: {text_input}"},
+            {"role": "assistant", "content": f"🤖 {response}"}
+        ]
+        state = state.copy() if state else []
+        state.extend(messages)
+        print(f"Final audio response - SR: {audio_response[0]}, Shape: {audio_response[1].shape}, Dtype: {audio_response[1].dtype}")
+        print(f"Final audio peaks: {np.max(np.abs(audio_response[1]))}")  # Should be > 0
+        audio_response = (audio_response[0], audio_response[1], str(time.time()))
+
+        return state, audio_response, None
+
+    def _normalize_audio(self, audio_array, sample_rate):
+        """Final format adjustment for Gradio compatibility."""
+        # Keep previous processing steps
+        if audio_array.dtype != np.int16:
+            audio_array = audio_array.astype(np.int16)
+
+        # Ensure 2D shape for mono audio (samples, 1)
+        if audio_array.ndim == 1:
+            audio_array = audio_array.reshape(-1, 1)
+
+        # Convert to 1D if mono
+        if audio_array.ndim == 2 and audio_array.shape[1] == 1:
+            audio_array = audio_array.flatten()  # Shape becomes (N,)
+
+        return (sample_rate, audio_array)
+
+    # The rest of the code (launch method, etc.) remains unchanged
+
+    def launch(self):
+        """Launch the Gradio interface with an audio refresh workaround."""
+        with gr.Blocks(title="Voice-Enabled Chatbot", css=".autoplay-audio { display: none }") as interface:
+            with gr.Row():
+                with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(
+                        label="Chat History",
+                        type="messages",
+                        height=400
+                    )
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        label="Speak Here",
+                        interactive=True  # Ensure it stays interactive
+                    )
+                    audio_output = gr.Audio(
+                        label="Assistant Response",
+                        autoplay=True,
+                        format="wav",  # Explicit format
+                        elem_id="audio-output"  # Add ID for JS control
+                    )
+
+            # Add JavaScript to force audio reload
+            interface.load(
+                None,
+                None,
+                None,
+                _js="""
+                () => {
+                    function reloadAudio() {
+                        const audio = document.querySelector('#audio-output audio');
+                        if (audio) {
+                            const source = audio.querySelector('source');
+                            if (source) {
+                                source.src += '#' + Date.now();
+                                audio.load();
+                            }
+                        }
+                    }
+                    setInterval(reloadAudio, 500);
+                }
+                """
+            )
+
+            # State for managing chat history
+            state = gr.State([])
+
+            # On page load, play welcome message and show initial chat history
+            interface.load(
+                fn=self.welcome_audio,
+                outputs=[chatbot, audio_output]
+            )
+
+            # When user speaks, process audio and update the chat
+            audio_input.change(
+                fn=self.process_audio,
+                inputs=[audio_input, state],
+                outputs=[chatbot, audio_output, audio_input],
+                api_name="process_audio"
+            ).then(
+                lambda: None,
+                None,
+                audio_input,
+                queue=False
+            )
+
+            interface.launch(
+                server_name="127.0.0.1",
+                server_port=7860,
+                share=True,
+                debug=True
+            )
+
+if __name__ == "__main__":
+    app = VoiceChatApp()
+    app.launch()
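One caveat in _normalize_audio: astype(np.int16) is a straight cast, so float TTS output in the usual [-1.0, 1.0] range would collapse to zeros (the edge-tts path is safe because pydub already yields int16 samples). A hedged sketch of the scaling that would be needed if float input ever reached it:

import numpy as np

def float_to_int16(audio_float: np.ndarray) -> np.ndarray:
    # Scale [-1.0, 1.0] float audio into the int16 range before casting;
    # a bare astype(np.int16) truncates such values to 0.
    clipped = np.clip(audio_float, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)

print(float_to_int16(np.array([0.0, 0.5, -1.0])))  # [     0  16383 -32767]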
app/scratch.py
ADDED
@@ -0,0 +1,17 @@
+import asyncio
+from utils.text_to_speech import TextToSpeech  # Update with your actual module path
+import soundfile as sf
+
+async def main():
+    tts = TextToSpeech()
+    text = "This is a test voice output"
+
+    # Get audio data
+    sample_rate, audio_data = await tts.synthesize(text)
+
+    # Save to file (WAV format)
+    sf.write("test_output.wav", audio_data, sample_rate)
+    print("Saved test_output.wav")
+
+if __name__ == "__main__":
+    asyncio.run(main())
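Since sounddevice is already pinned in environment.yml, a natural follow-up to this script is to audition the saved file straight from Python (a hypothetical extension, not part of the commit):

import sounddevice as sd
import soundfile as sf

data, sample_rate = sf.read("test_output.wav")
sd.play(data, sample_rate)  # non-blocking playback
sd.wait()                   # block until the clip finishes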
app/utils/__init__.py
ADDED
File without changes
app/utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (152 Bytes)
app/utils/__pycache__/chat_model.cpython-312.pyc
ADDED
Binary file (1.61 kB)
app/utils/__pycache__/med42.cpython-312.pyc
ADDED
Binary file (5.42 kB)
app/utils/__pycache__/speech_to_text.cpython-312.pyc
ADDED
Binary file (5.38 kB)
app/utils/__pycache__/text_to_speech.cpython-312.pyc
ADDED
Binary file (1.58 kB)
app/utils/chat_model.py
ADDED
@@ -0,0 +1,22 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+class ChatModel:
+    def __init__(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", token=True)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "mistralai/Mistral-7B-Instruct-v0.1",  # same checkpoint as the tokenizer (was the base model, a mismatch)
+            torch_dtype=torch.float16,
+            token=True
+        )
+
+    async def generate_response(self, input_text):
+        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_length=100,
+            num_return_sequences=1,
+            do_sample=True, temperature=0.7  # sampling must be enabled for temperature to apply
+        )
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return response
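ChatModel is not wired into either entry point yet; a minimal sketch of driving it from an async context (assumes a logged-in Hugging Face token and enough GPU memory for the 7B weights in fp16):

import asyncio
from utils.chat_model import ChatModel  # run from the app/ directory, like main.py

async def demo():
    chat = ChatModel()  # downloads ~14 GB of weights on first use
    reply = await chat.generate_response("Summarise why follow-up calls matter.")
    print(reply)

asyncio.run(demo())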
app/utils/med42.py
ADDED
@@ -0,0 +1,124 @@
+# Import relevant packages
+import requests
+import yaml
+from typing import List, Union
+from pathlib import Path
+
+
+class Med42():
+    """
+    A class for interacting with the Med42 API
+    """
+    def __init__(self, base_endpoint: str = "https://dev-openai-api.med42.ai/",
+                 prompt_filepath: Union[str, Path, None] = "./prompts.yaml"):
+        """
+        Initialise the Med42 API caller object
+
+        Args:
+            base_endpoint: URL where the API is hosted. Defaults to "https://dev-openai-api.med42.ai/".
+            prompt_filepath: Filepath to prompts catalogue in yaml format. Defaults to "./prompts.yaml".
+        """
+        self.base_endpoint = base_endpoint
+        self.docs_endpoint = self.base_endpoint + "docs"
+        # self.available_models = self._list_model_ids()
+        self.prompt_catalogue = load_yaml(prompt_filepath) if prompt_filepath is not None else None
+
+
+    def __repr__(self):
+        """
+        Returns: Description of this object class including a link to the API docs page
+        """
+        return f"Med42 API calls available, see docs at {self.docs_endpoint}"
+
+
+    def list_models(self) -> List[dict]:
+        """
+        Pull information on all the models currently hosted on the API
+
+        Returns:
+            List of models available and their details
+        """
+        try:
+            response = requests.get(self.base_endpoint + "v1/models")
+            response.raise_for_status()  # raise an exception for HTTP errors
+            return response.json()['data']
+
+        except requests.RequestException as e:
+            print("Error: ", e)
+
+
+    def _list_model_ids(self) -> List[str]:
+        """
+        Returns: List of currently available model ids
+        """
+        return [model_entry['id'] for model_entry in self.list_models()]
+
+
+    def _update_prompt_catalogue(self, prompt_filepath: Union[str, Path] = "./prompts.yaml") -> dict:
+        """
+        Read in a catalogue of prompts (recorded in a yaml file) and update the attribute
+
+        Args:
+            prompt_filepath: filepath to the catalogue of prompts to be read. Defaults to "./prompts.yaml"
+        """
+        self.prompt_catalogue = load_yaml(prompt_filepath)
+        print(f"Prompt catalogue updated to be the prompts from {prompt_filepath}")
+
+
+    def chat_completion(self, system_instruct: str, user_instruct: str, llm_model: str = "Llama3-Med42-70B-32k",
+                        temp: float = 0.7, generate_log: bool = False) -> str:
+        """
+        Uses the chat completion functionality
+
+        Args:
+            system_instruct: System instruction
+            user_instruct: User instruction
+            llm_model: Language model to use. Defaults to "Llama3-Med42-70B-32k"
+            temp: Temperature parameter for generating responses. Defaults to 0.7
+            generate_log: Whether to log the response or not. Defaults to False
+
+        Returns:
+            LLM output if generate_log is False, otherwise a tuple of a log dictionary (system instruction, user instruction, LLM call response) and the LLM output
+        """
+        return "hardcoded response because Med42 is not working I don't think. Just making it longer"  # early return; the code below is unreachable while the API is down
+        data = {"model": llm_model, "messages": [{"role": "system", "content": system_instruct},
+                                                 {"role": "user", "content": f"{user_instruct}"}], "temperature": temp}
+
+        headers = {"Content-Type": "application/json"}
+
+        try:
+            response = requests.post(self.base_endpoint + "v1/chat/completions", json=data, headers=headers)
+            response.raise_for_status()
+            if not generate_log:
+                return response.json()["choices"][0]["message"]["content"]  # LLM chat completion output
+            else:
+                # LLM logged response plus chat completion output (tuple outputted)
+                return {"system_instruction": system_instruct, "user_instruction": user_instruct, "llm_call_response": response.json()}, response.json()["choices"][0]["message"]["content"]
+        except requests.RequestException as e:
+            print("Error: ", e)
+
+
+# Util functions
+def load_yaml(yaml_filepath: str | Path) -> dict:
+    """
+    Read in YAML files
+
+    Args:
+        yaml_filepath: filepath to the yaml file to be read
+
+    Returns:
+        Contents of the YAML file
+    """
+    try:
+        with open(yaml_filepath, 'r') as file:
+            data = yaml.safe_load(file)
+            return data
+    except FileNotFoundError:
+        raise FileNotFoundError(f"File not found: {yaml_filepath}")
+    except IOError as e:
+        raise IOError(f"Error reading file: {e}")
+    except yaml.YAMLError as e:
+        raise ValueError(f"Error parsing YAML file: {e}")
+
+if __name__ == "__main__":
+    llm_caller = Med42()
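A minimal sketch of the intended call path once the API is reachable again (note that chat_completion currently short-circuits to the hardcoded string above; prompts.yaml is skipped here so the example does not depend on it):

from utils.med42 import Med42

llm = Med42(prompt_filepath=None)  # avoid requiring a local prompts.yaml
print(llm)  # points at https://dev-openai-api.med42.ai/docs

reply = llm.chat_completion(
    system_instruct="You are a concise clinical assistant.",
    user_instruct="List red-flag symptoms that warrant an urgent follow-up.",
)
print(reply)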
app/utils/speech_to_text.py
ADDED
@@ -0,0 +1,92 @@
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import numpy as np
+import torch
+import librosa
+
+class ASRConfig:
+    """Configuration class for ASR transcription."""
+    def __init__(
+        self,
+        model_id="openai/whisper-large-v2",
+        language="english",
+        sampling_rate=16000,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    ):
+        self.model_id = model_id
+        self.language = language
+        self.sampling_rate = sampling_rate
+        self.device = device
+        self.torch_dtype = torch_dtype
+
+class SpeechRecognizer:
+    def __init__(self, config: ASRConfig = None):
+        self.config = config if config else ASRConfig()
+        print(f"Using ASR configuration: {self.config.__dict__}")
+        self._setup_model()
+
+    def _setup_model(self):
+        """Initialize the Whisper model and processor."""
+        try:
+            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                self.config.model_id,
+                torch_dtype=self.config.torch_dtype,
+                use_safetensors=True,
+            ).to(self.config.device)
+
+            self.processor = AutoProcessor.from_pretrained(self.config.model_id)
+            self.pipe = pipeline(
+                "automatic-speech-recognition",
+                model=self.model,
+                tokenizer=self.processor.tokenizer,
+                feature_extractor=self.processor.feature_extractor,
+                torch_dtype=self.config.torch_dtype,
+                device=self.config.device,
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to set up Whisper model: {str(e)}")
+
+    async def transcribe(self, audio: tuple, prompt: str = None) -> str:
+        """
+        Transcribes the provided audio using the Whisper pipeline.
+
+        Args:
+            audio (tuple): A tuple containing (sample_rate, audio_array).
+            prompt (str): An optional text prompt to guide transcription.
+
+        Returns:
+            str: Transcription of the audio.
+        """
+        if not audio or len(audio) != 2:
+            raise ValueError("Invalid audio input. Expected a tuple (sample_rate, audio_array).")
+
+        try:
+            # Extract the raw audio data (audio_array) from the input tuple
+            sample_rate, audio_array = audio
+
+            # Ensure the audio is a numpy array and has the expected format
+            if not isinstance(audio_array, np.ndarray):
+                raise TypeError(f"Expected numpy.ndarray for audio data, got {type(audio_array)}")
+
+            # Ensure the audio array is in floating-point format
+            if audio_array.dtype != np.float32:
+                audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
+
+            # Resample audio if the sample rate differs from the configured rate
+            if sample_rate != self.config.sampling_rate:
+                audio_array = librosa.resample(audio_array, orig_sr=sample_rate,
+                                               target_sr=self.config.sampling_rate)
+
+            # Prepare generate_kwargs for the pipeline
+            generate_kwargs = {}
+            if self.config.language:
+                generate_kwargs["language"] = self.config.language
+            if prompt:
+                prompt_ids = self.processor.get_prompt_ids(prompt, return_tensors="pt").to(self.config.device)
+                generate_kwargs["prompt_ids"] = prompt_ids
+
+            # Run transcription through the pipeline
+            result = self.pipe(audio_array, generate_kwargs=generate_kwargs)
+            return result["text"].strip()
+        except Exception as e:
+            raise RuntimeError(f"Transcription failed: {str(e)}")
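transcribe expects exactly the (sample_rate, int16 array) tuple that Gradio's numpy audio component emits, so a plumbing check does not need a microphone. A hedged smoke test (Whisper returns an empty or arbitrary string for pure silence; the point is only that dtype conversion and resampling run):

import asyncio
import numpy as np
from utils.speech_to_text import SpeechRecognizer

async def smoke_test():
    recognizer = SpeechRecognizer()  # fetches whisper-large-v2 on first run
    silence = np.zeros(48000, dtype=np.int16)  # one second at 48 kHz, forces the resample branch
    text = await recognizer.transcribe((48000, silence))
    print(repr(text))

asyncio.run(smoke_test())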
app/utils/text_to_speech.py
ADDED
@@ -0,0 +1,24 @@
+import edge_tts
+import io
+import numpy as np
+from pydub import AudioSegment  # Install with: pip install pydub
+
+class TextToSpeech:
+    def __init__(self, voice="en-US-AriaNeural"):
+        self.voice = voice
+
+    async def synthesize(self, text):
+        communicate = edge_tts.Communicate(text, self.voice)
+
+        # Collect raw audio bytes
+        audio_bytes = bytearray()
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                audio_bytes.extend(chunk["data"])
+
+        # Convert to numpy array using pydub
+        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
+        samples = np.array(audio.get_array_of_samples())
+        sample_rate = audio.frame_rate
+
+        return (sample_rate, samples)
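pydub decodes the MP3 stream via ffmpeg, so ffmpeg must be on PATH. For trying voices other than en-US-AriaNeural, edge_tts ships an async voice listing; a small sketch (assumes network access to the Edge TTS service):

import asyncio
import edge_tts

async def english_voices():
    for voice in await edge_tts.list_voices():
        if voice["Locale"].startswith("en-"):
            print(voice["ShortName"])  # e.g. en-US-AriaNeural, the default above

asyncio.run(english_voices())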
environment.yml
ADDED
@@ -0,0 +1,211 @@
+name: gencent_env
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - aiohappyeyeballs=2.4.4=py312hca03da5_0
+  - aiohttp=3.11.10=py312h80987f9_0
+  - aiosignal=1.2.0=pyhd3eb1b0_0
+  - altair=5.0.1=py312hca03da5_0
+  - arrow-cpp=16.1.0=hbc20fb2_0
+  - attrs=24.2.0=py312hca03da5_0
+  - aws-c-auth=0.6.19=h80987f9_0
+  - aws-c-cal=0.5.20=h80987f9_0
+  - aws-c-common=0.8.5=h80987f9_0
+  - aws-c-compression=0.2.16=h80987f9_0
+  - aws-c-event-stream=0.2.15=h313beb8_0
+  - aws-c-http=0.6.25=h80987f9_0
+  - aws-c-io=0.13.10=h80987f9_0
+  - aws-c-mqtt=0.7.13=h80987f9_0
+  - aws-c-s3=0.1.51=h80987f9_0
+  - aws-c-sdkutils=0.1.6=h80987f9_0
+  - aws-checksums=0.1.13=h80987f9_0
+  - aws-crt-cpp=0.18.16=h313beb8_0
+  - aws-sdk-cpp=1.10.55=h313beb8_0
+  - blas=1.0=openblas
+  - blinker=1.6.2=py312hca03da5_0
+  - boost-cpp=1.82.0=h48ca7d4_2
+  - bottleneck=1.4.2=py312ha86b861_0
+  - brotli-python=1.0.9=py312h313beb8_8
+  - bzip2=1.0.8=h80987f9_6
+  - c-ares=1.19.1=h80987f9_0
+  - ca-certificates=2024.11.26=hca03da5_0
+  - cachetools=5.3.3=py312hca03da5_0
+  - certifi=2024.12.14=py312hca03da5_0
+  - cffi=1.17.1=py312h3eb5a62_0
+  - charset-normalizer=3.3.2=pyhd3eb1b0_0
+  - click=8.1.7=py312hca03da5_0
+  - datasets=3.2.0=pyhd8ed1ab_0
+  - dill=0.3.8=py312hca03da5_0
+  - filelock=3.13.1=py312hca03da5_0
+  - freetype=2.12.1=h1192e45_0
+  - frozenlist=1.5.0=py312h80987f9_0
+  - fsspec=2024.6.1=py312hca03da5_0
+  - gettext=0.22.5=h8414b35_3
+  - gettext-tools=0.22.5=h8414b35_3
+  - gflags=2.2.2=h313beb8_1
+  - gitdb=4.0.7=pyhd3eb1b0_0
+  - gitpython=3.1.43=py312hca03da5_0
+  - glog=0.5.0=h313beb8_1
+  - huggingface_hub=0.26.5=pyhd8ed1ab_1
+  - icu=73.1=h313beb8_0
+  - idna=3.7=py312hca03da5_0
+  - jinja2=3.1.4=py312hca03da5_1
+  - jpeg=9e=h80987f9_3
+  - jsonschema=4.23.0=py312hca03da5_0
+  - jsonschema-specifications=2023.7.1=py312hca03da5_0
+  - krb5=1.20.1=hf3e1bf2_1
+  - lame=3.100=h1a28f6b_0
+  - lcms2=2.12=hba8e193_0
+  - lerc=3.0=hc377ac9_0
+  - libabseil=20240116.2=cxx17_h313beb8_0
+  - libasprintf=0.22.5=h8414b35_3
+  - libasprintf-devel=0.22.5=h8414b35_3
+  - libboost=1.82.0=h0bc93f9_2
+  - libbrotlicommon=1.0.9=h80987f9_8
+  - libbrotlidec=1.0.9=h80987f9_8
+  - libbrotlienc=1.0.9=h80987f9_8
+  - libcurl=8.9.1=h3e2b118_0
+  - libcxx=19.1.6=ha82da77_1
+  - libdeflate=1.17=h80987f9_1
+  - libedit=3.1.20230828=h80987f9_0
+  - libev=4.33=h1a28f6b_1
+  - libevent=2.1.12=h02f6b3c_1
+  - libexpat=2.6.3=hf9b8971_0
+  - libffi=3.4.4=hca03da5_1
+  - libflac=1.4.3=hb765f3a_0
+  - libgettextpo=0.22.5=h8414b35_3
+  - libgettextpo-devel=0.22.5=h8414b35_3
+  - libgfortran=5.0.0=11_3_0_hca03da5_28
+  - libgfortran5=11.3.0=h009349e_28
+  - libgrpc=1.62.2=h62f6fdd_0
+  - libiconv=1.17=h0d3ecfb_2
+  - libintl=0.22.5=h8414b35_3
+  - libintl-devel=0.22.5=h8414b35_3
+  - libnghttp2=1.57.0=h62f6fdd_0
+  - libogg=1.3.5=h1a28f6b_1
+  - libopenblas=0.3.21=h269037a_0
+  - libopus=1.3.1=h80987f9_1
+  - libpng=1.6.39=h80987f9_0
+  - libprotobuf=4.25.3=h514c7bf_0
+  - libsndfile=1.2.2=h9739721_1
+  - libsqlite=3.46.0=hfb93653_0
+  - libssh2=1.11.0=h3e2b118_0
+  - libthrift=0.15.0=h73c2103_2
+  - libtiff=4.5.1=h313beb8_0
+  - libvorbis=1.3.7=h1a28f6b_0
+  - libwebp-base=1.3.2=h80987f9_1
+  - libzlib=1.2.13=hfb2fe0b_6
+  - llvm-openmp=14.0.6=hc6e5704_0
+  - lz4-c=1.9.4=h313beb8_1
+  - markdown-it-py=2.2.0=py312hca03da5_1
+  - markupsafe=2.1.3=py312h80987f9_0
+  - mdurl=0.1.0=py312hca03da5_0
+  - mpg123=1.32.9=hf642e45_0
+  - mpmath=1.3.0=py312hca03da5_0
+  - multidict=6.1.0=py312h80987f9_0
+  - multiprocess=0.70.15=py312hca03da5_0
+  - ncurses=6.4=h313beb8_0
+  - networkx=3.3=py312hca03da5_0
+  - numexpr=2.10.1=py312h5d9532f_0
+  - numpy=1.26.4=py312h7f4fdc5_0
+  - numpy-base=1.26.4=py312he047099_0
+  - openjpeg=2.5.2=h54b8e55_0
+  - openssl=3.4.0=h39f12f2_0
+  - orc=2.0.1=h937ddfc_0
+  - packaging=24.1=py312hca03da5_0
+  - pandas=2.2.2=py312hd77ebd4_0
+  - pillow=10.4.0=py312h80987f9_0
+  - pip=24.2=py312hca03da5_0
+  - portaudio=19.7.0=h5833ebf_0
+  - propcache=0.2.0=py312h80987f9_0
+  - protobuf=4.25.3=py312h8472c4a_0
+  - pyarrow=16.1.0=py312hd77ebd4_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pydeck=0.8.0=py312hca03da5_2
+  - pygments=2.15.1=py312hca03da5_1
+  - pysocks=1.7.1=py312hca03da5_0
+  - pysoundfile=0.12.1=pyhd8ed1ab_3
+  - python=3.12.2=hdf0ec26_0_cpython
+  - python-dateutil=2.9.0post0=py312hca03da5_2
+  - python-dotenv=1.0.1=pyhd8ed1ab_1
+  - python-sounddevice=0.5.0=pyhd8ed1ab_0
+  - python-tzdata=2023.3=pyhd3eb1b0_0
+  - python-xxhash=2.0.2=py312h80987f9_1
+  - python_abi=3.12=5_cp312
+  - pytorch=2.5.1=py3.12_0
+  - pytz=2024.1=py312hca03da5_0
+  - pyyaml=6.0.2=py312h80987f9_0
+  - re2=2022.04.01=hc377ac9_0
+  - readline=8.2=h1a28f6b_0
+  - referencing=0.30.2=py312hca03da5_0
+  - regex=2024.9.11=py312h80987f9_0
+  - requests=2.32.3=py312hca03da5_1
+  - rich=13.7.1=py312hca03da5_0
+  - rpds-py=0.10.6=py312h2aea54e_1
+  - safetensors=0.4.5=py312h7805bc0_1
+  - setuptools=75.1.0=py312hca03da5_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - smmap=4.0.0=pyhd3eb1b0_0
+  - snappy=1.2.1=h313beb8_0
+  - sqlite=3.45.3=h80987f9_0
+  - streamlit=1.38.0=py312hca03da5_0
+  - tenacity=8.2.3=py312hca03da5_0
+  - tk=8.6.14=h6ba3021_0
+  - tokenizers=0.21.0=py312hf3e4074_0
+  - toml=0.10.2=pyhd3eb1b0_0
+  - toolz=0.12.0=py312hca03da5_0
+  - tornado=6.4.1=py312h80987f9_0
+  - tqdm=4.66.5=py312h989b03a_0
+  - transformers=4.47.1=pyhd8ed1ab_0
+  - tzdata=2024b=h04d1e81_0
+  - urllib3=2.2.3=py312hca03da5_0
+  - utf8proc=2.6.1=h80987f9_1
+  - wheel=0.44.0=py312hca03da5_0
+  - xxhash=0.8.0=h1a28f6b_3
+  - xz=5.4.6=h80987f9_1
+  - yaml=0.2.5=h1a28f6b_0
+  - yarl=1.18.0=py312h80987f9_0
+  - zlib=1.2.13=hfb2fe0b_6
+  - zstd=1.5.6=hfb09047_0
+  - pip:
+    - aiofiles==23.2.1
+    - annotated-types==0.7.0
+    - anyio==4.7.0
+    - edge-tts==7.0.0
+    - fastapi==0.115.6
+    - ffmpy==0.5.0
+    - gradio==5.9.1
+    - gradio-client==1.5.2
+    - h11==0.14.0
+    - httpcore==1.0.7
+    - httpx==0.28.1
+    - hyperpyyaml==1.2.2
+    - joblib==1.4.2
+    - orjson==3.10.13
+    - pydantic==2.10.4
+    - pydantic-core==2.27.2
+    - pydub==0.25.1
+    - python-multipart==0.0.20
+    - ruamel-yaml==0.18.7
+    - ruamel-yaml-clib==0.2.12
+    - ruff==0.8.4
+    - safehttpx==0.1.6
+    - scipy==1.14.1
+    - semantic-version==2.10.0
+    - sentencepiece==0.2.0
+    - shellingham==1.5.4
+    - sniffio==1.3.1
+    - speechbrain==1.0.2
+    - srt==3.5.3
+    - starlette==0.41.3
+    - sympy==1.13.1
+    - tabulate==0.9.0
+    - tomlkit==0.13.2
+    - torchaudio==2.5.1
+    - typer==0.15.1
+    - typing-extensions==4.12.2
+    - uvicorn==0.34.0
+    - websockets==14.1
+prefix: /Users/hamza/miniconda3/envs/gencent_env
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+# Not sure version numbers here are actually correct
+gradio==3.50.2
+speechbrain==1.0.2
+transformers==4.21.0
+python-dotenv==0.19.0
+sounddevice==0.4.3
+soundfile==0.10.3.post1
+edge-tts==4.0.0
+# flask==2.0.1
+torch==2.3.1
test_output.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1260f6a6c0583fec3824fc8851c44c1adf2ac7b630fa09d9cb38dc65b5286c
+size 130220
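This is a Git LFS pointer, not audio: the real 130 kB wav lives in LFS storage and is fetched on checkout. A quick Python check that a checked-out file matches the pointer's oid (which is the sha256 of the file contents):

import hashlib

with open("test_output.wav", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

# Prints True for the real file; a bare pointer file will not match
print(digest == "8f1260f6a6c0583fec3824fc8851c44c1adf2ac7b630fa09d9cb38dc65b5286c")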