tommytracx committed on
Commit b0aa032 · verified · 1 Parent(s): 233531a

Update app.py

Files changed (1)
  1. app.py +63 -26
app.py CHANGED
@@ -2,14 +2,17 @@ import gradio as gr
 from PIL import Image
 import torch
 import soundfile as sf
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from transformers import AutoModelForCausalLM, AutoProcessor
 from urllib.request import urlopen
 import spaces
+import os
 
-# Define model path
+# ==============================
+# Model and Processor Loading
+# ==============================
+
 model_path = "microsoft/Phi-4-multimodal-instruct"
 
-# Load model and processor
 processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
@@ -19,32 +22,46 @@ model = AutoModelForCausalLM.from_pretrained(
     _attn_implementation="eager",
 )
 
-# Define prompt structure
+# ==============================
+# Prompt Templates
+# ==============================
+
 user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
-# Define inference function
+# ==============================
+# Inference Function
+# ==============================
+
 @spaces.GPU
 def process_input(input_type, file, question):
     if not file or not question:
         return "Please upload a file and provide a question."
 
-    # Prepare the prompt
+    # Prepare the multimodal prompt
     if input_type == "Image":
         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Open image from uploaded file
-        image = Image.open(file)
+        # Handle file or URL
+        if isinstance(file, str) and file.startswith("http"):
+            image = Image.open(urlopen(file))
+        else:
+            image = Image.open(file.name if hasattr(file, "name") else file)
         inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+
     elif input_type == "Audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Read audio from uploaded file
-        audio, samplerate = sf.read(file)
+        if isinstance(file, str) and file.startswith("http"):
+            audio_file = urlopen(file)
+            audio, samplerate = sf.read(audio_file)
+        else:
+            audio, samplerate = sf.read(file.name if hasattr(file, "name") else file)
         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
+
     else:
         return "Invalid input type selected."
 
-    # Generate response
+    # Generate the response
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
@@ -58,7 +75,10 @@ def process_input(input_type, file, question):
 
     return response
 
-# Gradio interface
+# ==============================
+# Gradio UI Setup
+# ==============================
+
 with gr.Blocks(
     title="Demo of how GABI could use a Multimodal",
     theme=gr.themes.Soft(
@@ -67,14 +87,29 @@ with gr.Blocks(
         radius_size="lg",
     ),
 ) as demo:
+
+    # Insert Simli FaceTime Widget
+    gr.HTML(
+        """
+        <simli-widget
+            token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
+            agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
+            position="right"
+            customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
+            customtext="FaceTime GABI"
+        ></simli-widget>
+        <script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
+        """
+    )
+
+    # Header
     gr.Markdown(
         """
-        # This Space is using Phi-4 as the LLM for the Multimodal Demo
-        Try uploading an **image** or **audio** file, ask a question, and get a response from the model!
-        We want to leverage this to allow GABI to have the ability to interact and understand various contents.
+        # Multimodal Demo - Powered by GABI using Phi-4
+        Upload an **image** or **audio** file, ask a question, and GABI will respond intelligently!
         """
     )
-
+
     with gr.Row():
         with gr.Column(scale=1):
             input_type = gr.Radio(
@@ -88,7 +123,7 @@ with gr.Blocks(
             )
             question_input = gr.Textbox(
                 label="Your Question",
-                placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
+                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                 lines=2,
             )
             submit_btn = gr.Button("Submit", variant="primary")
@@ -96,31 +131,33 @@ with gr.Blocks(
         with gr.Column(scale=2):
             output_text = gr.Textbox(
                 label="Gabi's Response",
-                placeholder="Gabi's response will appear here...",
+                placeholder="Gabi's answer will appear here...",
                 lines=10,
                 interactive=False,
             )
 
-    # Example section
+    # Example Usage
     with gr.Accordion("Examples", open=False):
-        gr.Markdown("Try these examples:")
+        gr.Markdown("Fill the fields using an example, then click **Submit** manually:")
         gr.Examples(
             examples=[
-                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
-                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
+                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
+                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
             ],
             inputs=[input_type, file_input, question_input],
-            outputs=output_text,
-            fn=process_input,
+            outputs=None,
             cache_examples=False,
         )
 
-    # Connect the submit button
+    # Submit Button Binding
     submit_btn.click(
         fn=process_input,
         inputs=[input_type, file_input, question_input],
         outputs=output_text,
     )
 
-# Launch the demo
+# ==============================
+# Launch App
+# ==============================
+
 demo.launch()
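
For reference, a minimal standalone sketch of the URL-vs-local-file handling that this commit adds to process_input, assuming only Pillow, soundfile, and the standard library. The helper names load_image and load_audio are illustrative and are not defined in app.py, and the io.BytesIO wrapping is an extra precaution so remote audio is decoded from a seekable buffer; the example inputs below are the ones listed in the Gradio Examples block.

import io
from urllib.request import urlopen

import soundfile as sf
from PIL import Image


def load_image(file):
    # Accept an http(s) URL, a local path, or a Gradio upload object with .name
    if isinstance(file, str) and file.startswith("http"):
        return Image.open(io.BytesIO(urlopen(file).read()))
    return Image.open(file.name if hasattr(file, "name") else file)


def load_audio(file):
    # Return (samples, samplerate); same URL / path / upload handling as above
    if isinstance(file, str) and file.startswith("http"):
        return sf.read(io.BytesIO(urlopen(file).read()))
    return sf.read(file.name if hasattr(file, "name") else file)


if __name__ == "__main__":
    img = load_image("https://www.ilankelman.org/stopsigns/australia.jpg")
    print(img.size)
    audio, samplerate = load_audio(
        "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
        "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
    )
    print(len(audio), samplerate)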