AustingDong committed
Commit 09b0453 · 1 Parent(s): 3912684
correct terms
app.py CHANGED
@@ -50,7 +50,7 @@ def clean():
 # Multimodal Understanding function
 @spaces.GPU(duration=120)
 def multimodal_understanding(model_type,
-
+                             activation_map_method,
                              visual_pooling_method,
                              image, question, seed, top_p, temperature, target_token_idx,
                              visualization_layer_min, visualization_layer_max, focus, response_type):
@@ -72,7 +72,7 @@ def multimodal_understanding(model_type,
         inputs = clip_utils.prepare_inputs([question], image)
 
 
-        if
+        if activation_map_method == "GradCAM":
             # Generate Grad-CAM
             all_layers = [layer.layer_norm1 for layer in clip_utils.model.vision_model.encoder.layers]
             if visualization_layers_min.value != visualization_layers_max.value:
@@ -117,7 +117,7 @@ def multimodal_understanding(model_type,
         else:
             start = 512
 
-        if
+        if activation_map_method == "GradCAM":
             # target_layers = vl_gpt.vision_model.vision_tower.blocks
             if focus == "Visual Encoder":
                 all_layers = [block.norm1 for block in vl_gpt.vision_model.vision_tower.blocks]
@@ -181,7 +181,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
     elif model_type.split('-')[0] == "Janus":
@@ -198,7 +198,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Visual Encoder", "Language Model"], value="Visual Encoder", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -216,7 +216,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -233,7 +233,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -244,7 +244,7 @@ def focus_change(focus):
     global model_name, language_model_max_layer
     if model_name == "Clip":
         res = (
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max")
         )
@@ -253,14 +253,14 @@ def focus_change(focus):
     if focus == "Language Model":
         if response_type.value == "answer + visualization":
             res = (
-                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
             )
             return res
         else:
             res = (
-                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
             )
@@ -268,7 +268,7 @@ def focus_change(focus):
 
     else:
         res = (
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max")
         )
@@ -283,13 +283,13 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             image_input = gr.Image()
-
+            activation_map_output = gr.Gallery(label="activation Map", height=300, columns=1)
 
         with gr.Column():
             model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-v1.6-Mistral-7B"], value="Clip", label="model")
            response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
            focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
-
+            activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
             visual_pooling_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
 
 
@@ -312,7 +312,7 @@ with gr.Blocks() as demo:
             visualization_layers_min,
             visualization_layers_max,
             focus,
-
+            activation_map_method
         ]
     )
 
@@ -320,7 +320,7 @@ with gr.Blocks() as demo:
         fn = focus_change,
         inputs = focus,
         outputs=[
-
+            activation_map_method,
            visualization_layers_min,
            visualization_layers_max,
        ]
@@ -329,7 +329,7 @@ with gr.Blocks() as demo:
     # response_type.change(
     #     fn = response_type_change,
     #     inputs = response_type,
-    #     outputs = [
+    #     outputs = [activation_map_method]
     # )
 
 
@@ -424,9 +424,9 @@ with gr.Blocks() as demo:
 
     understanding_button.click(
         multimodal_understanding,
-        inputs=[model_selector,
+        inputs=[model_selector, activation_map_method, visual_pooling_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
                 visualization_layers_min, visualization_layers_max, focus, response_type],
-        outputs=[understanding_output,
+        outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
     )
 
 demo.launch(share=True)
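Most of this diff is the same rename threaded through Gradio's event wiring: the dropdown component must exist in the layout, be listed in the click handler's inputs, appear as a parameter of multimodal_understanding, and be re-emitted by focus_change as an output update. Below is a minimal, self-contained sketch of that pattern; the handler names (run, on_focus_change) and their stub bodies are hypothetical stand-ins, not this app's code.

import gradio as gr

def run(activation_map_method, focus):
    # Stand-in for multimodal_understanding: each component listed in
    # `inputs` arrives as one positional argument, in order.
    return f"method={activation_map_method}, focus={focus}"

def on_focus_change(focus):
    # Returning a fresh component instance from a handler updates the
    # existing component in place (the focus_change pattern in the diff).
    return gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")

with gr.Blocks() as demo:
    focus = gr.Dropdown(choices=["Visual Encoder", "Language Model"],
                        value="Visual Encoder", label="focus")
    activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM",
                                        label="activation map type")
    out = gr.Textbox(label="output")
    btn = gr.Button("Run")

    # The dropdown is both an input to the click handler and an output of
    # the change handler, mirroring how the diff wires activation_map_method.
    btn.click(run, inputs=[activation_map_method, focus], outputs=[out])
    focus.change(on_focus_change, inputs=focus, outputs=[activation_map_method])

demo.launch()

This is why a rename like this touches so many hunks: Gradio matches handler arguments to components purely by position in the inputs list, so the new name has to stay consistent across the signature, the layout, and every event registration.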
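For the Clip branch, the newly gated block builds Grad-CAM targets from the first LayerNorm of each vision-encoder block. Here is a sketch of that selection, assuming the stock transformers CLIPModel (the app reaches it through clip_utils.model); select_target_layers is a hypothetical helper, and the inclusive-slice reading of the min/max sliders is an assumption, since the hunk cuts off before that logic.

from transformers import CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# One candidate Grad-CAM target per transformer block, mirroring the diff's
# [layer.layer_norm1 for layer in clip_utils.model.vision_model.encoder.layers]
all_layers = [layer.layer_norm1 for layer in model.vision_model.encoder.layers]

def select_target_layers(layers, layer_min, layer_max):
    # Slider values are 1-indexed; take an inclusive range when the two
    # sliders differ, otherwise a single layer (the `min != max` branch).
    if layer_min != layer_max:
        return layers[layer_min - 1:layer_max]
    return [layers[layer_min - 1]]

target_layers = select_target_layers(all_layers, 1, 12)  # all 12 ViT-B/32 blocks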