tommytracx committed on
Commit b0aa032 · verified · 1 Parent(s): 233531a

Update app.py

Files changed (1)
  1. app.py +63 -26
app.py CHANGED
@@ -2,14 +2,17 @@ import gradio as gr
 from PIL import Image
 import torch
 import soundfile as sf
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from transformers import AutoModelForCausalLM, AutoProcessor
 from urllib.request import urlopen
 import spaces
+import os
 
-# Define model path
+# ==============================
+# Model and Processor Loading
+# ==============================
+
 model_path = "microsoft/Phi-4-multimodal-instruct"
 
-# Load model and processor
 processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
@@ -19,32 +22,46 @@ model = AutoModelForCausalLM.from_pretrained(
     _attn_implementation="eager",
 )
 
-# Define prompt structure
+# ==============================
+# Prompt Templates
+# ==============================
+
 user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
-# Define inference function
+# ==============================
+# Inference Function
+# ==============================
+
 @spaces.GPU
 def process_input(input_type, file, question):
     if not file or not question:
         return "Please upload a file and provide a question."
 
-    # Prepare the prompt
+    # Prepare the multimodal prompt
     if input_type == "Image":
         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Open image from uploaded file
-        image = Image.open(file)
+        # Handle file or URL
+        if isinstance(file, str) and file.startswith("http"):
+            image = Image.open(urlopen(file))
+        else:
+            image = Image.open(file.name if hasattr(file, "name") else file)
         inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+
     elif input_type == "Audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Read audio from uploaded file
-        audio, samplerate = sf.read(file)
+        if isinstance(file, str) and file.startswith("http"):
+            audio_file = urlopen(file)
+            audio, samplerate = sf.read(audio_file)
+        else:
+            audio, samplerate = sf.read(file.name if hasattr(file, "name") else file)
         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
+
     else:
         return "Invalid input type selected."
 
-    # Generate response
+    # Generate the response
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
@@ -58,7 +75,10 @@ def process_input(input_type, file, question):
 
     return response
 
-# Gradio interface
+# ==============================
+# Gradio UI Setup
+# ==============================
+
 with gr.Blocks(
     title="Demo of how GABI could use a Multimodal",
     theme=gr.themes.Soft(
@@ -67,14 +87,29 @@ with gr.Blocks(
         radius_size="lg",
     ),
 ) as demo:
+
+    # Insert Simli FaceTime Widget
+    gr.HTML(
+        """
+        <simli-widget
+            token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
+            agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
+            position="right"
+            customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
+            customtext="FaceTime GABI"
+        ></simli-widget>
+        <script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
+        """
+    )
+
+    # Header
     gr.Markdown(
         """
-        # This Space is using Phi-4 as the LLM for the Multimodal Demo
-        Try uploading an **image** or **audio** file, ask a question, and get a response from the model!
-        We want to leverage this to allow GABI to have the ability to interact and understand various contents.
+        # Multimodal Demo - Powered by GABI using Phi-4
+        Upload an **image** or **audio** file, ask a question, and GABI will respond intelligently!
         """
     )
-
+
     with gr.Row():
         with gr.Column(scale=1):
             input_type = gr.Radio(
@@ -88,7 +123,7 @@ with gr.Blocks(
             )
             question_input = gr.Textbox(
                 label="Your Question",
-                placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
+                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                 lines=2,
             )
             submit_btn = gr.Button("Submit", variant="primary")
@@ -96,31 +131,33 @@ with gr.Blocks(
         with gr.Column(scale=2):
             output_text = gr.Textbox(
                 label="Gabi's Response",
-                placeholder="Gabi's response will appear here...",
+                placeholder="Gabi's answer will appear here...",
                 lines=10,
                 interactive=False,
             )
 
-    # Example section
+    # Example Usage
     with gr.Accordion("Examples", open=False):
-        gr.Markdown("Try these examples:")
+        gr.Markdown("Fill the fields using an example, then click **Submit** manually:")
         gr.Examples(
             examples=[
-                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
-                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
+                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
+                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
             ],
             inputs=[input_type, file_input, question_input],
-            outputs=output_text,
-            fn=process_input,
+            outputs=None,
             cache_examples=False,
         )
 
-    # Connect the submit button
+    # Submit Button Binding
     submit_btn.click(
         fn=process_input,
         inputs=[input_type, file_input, question_input],
         outputs=output_text,
     )
 
-# Launch the demo
+# ==============================
+# Launch App
+# ==============================
+
 demo.launch()
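
For reference, a minimal standalone sketch of the URL-vs-local-file handling that this commit adds to process_input, assuming only Pillow, soundfile, and the standard library. The helper names load_image and load_audio are illustrative and are not defined in app.py, and the io.BytesIO wrapping is an extra precaution so remote audio is decoded from a seekable buffer; the example inputs below are the ones listed in the Gradio Examples block.

import io
from urllib.request import urlopen

import soundfile as sf
from PIL import Image


def load_image(file):
    # Accept an http(s) URL, a local path, or a Gradio upload object with .name
    if isinstance(file, str) and file.startswith("http"):
        return Image.open(io.BytesIO(urlopen(file).read()))
    return Image.open(file.name if hasattr(file, "name") else file)


def load_audio(file):
    # Return (samples, samplerate); same URL / path / upload handling as above
    if isinstance(file, str) and file.startswith("http"):
        return sf.read(io.BytesIO(urlopen(file).read()))
    return sf.read(file.name if hasattr(file, "name") else file)


if __name__ == "__main__":
    img = load_image("https://www.ilankelman.org/stopsigns/australia.jpg")
    print(img.size)
    audio, samplerate = load_audio(
        "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
        "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
    )
    print(len(audio), samplerate)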