Commit 09b0453 by AustingDong · Parent: 3912684

correct terms
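Renames the saliency_map_method control, the saliency_map_output gallery, and every "saliency map type" label in app.py to activation_map_method, activation_map_output, and "activation map type".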

Files changed (1): app.py (+18 −18)
app.py CHANGED
@@ -50,7 +50,7 @@ def clean():
 # Multimodal Understanding function
 @spaces.GPU(duration=120)
 def multimodal_understanding(model_type,
-                             saliency_map_method,
+                             activation_map_method,
                              visual_pooling_method,
                              image, question, seed, top_p, temperature, target_token_idx,
                              visualization_layer_min, visualization_layer_max, focus, response_type):
@@ -72,7 +72,7 @@ def multimodal_understanding(model_type,
         inputs = clip_utils.prepare_inputs([question], image)
 
 
-        if saliency_map_method == "GradCAM":
+        if activation_map_method == "GradCAM":
             # Generate Grad-CAM
             all_layers = [layer.layer_norm1 for layer in clip_utils.model.vision_model.encoder.layers]
             if visualization_layers_min.value != visualization_layers_max.value:
@@ -117,7 +117,7 @@ def multimodal_understanding(model_type,
         else:
             start = 512
 
-        if saliency_map_method == "GradCAM":
+        if activation_map_method == "GradCAM":
             # target_layers = vl_gpt.vision_model.vision_tower.blocks
             if focus == "Visual Encoder":
                 all_layers = [block.norm1 for block in vl_gpt.vision_model.vision_tower.blocks]
@@ -181,7 +181,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
     elif model_type.split('-')[0] == "Janus":
@@ -198,7 +198,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Visual Encoder", "Language Model"], value="Visual Encoder", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -216,7 +216,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -233,7 +233,7 @@ def model_slider_change(model_type):
             gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
         return res
 
@@ -244,7 +244,7 @@ def focus_change(focus):
     global model_name, language_model_max_layer
     if model_name == "Clip":
         res = (
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max")
         )
@@ -253,14 +253,14 @@ def focus_change(focus):
     if focus == "Language Model":
         if response_type.value == "answer + visualization":
             res = (
-                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
             )
             return res
         else:
             res = (
-                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
             )
@@ -268,7 +268,7 @@ def focus_change(focus):
 
     else:
         res = (
-            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
             gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max")
         )
@@ -283,13 +283,13 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             image_input = gr.Image()
-            saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)
+            activation_map_output = gr.Gallery(label="activation Map", height=300, columns=1)
 
         with gr.Column():
             model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-v1.6-Mistral-7B"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
-            saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
             visual_pooling_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
 
 
@@ -312,7 +312,7 @@ with gr.Blocks() as demo:
             visualization_layers_min,
             visualization_layers_max,
             focus,
-            saliency_map_method
+            activation_map_method
         ]
     )
 
@@ -320,7 +320,7 @@ with gr.Blocks() as demo:
         fn = focus_change,
         inputs = focus,
         outputs=[
-            saliency_map_method,
+            activation_map_method,
            visualization_layers_min,
            visualization_layers_max,
        ]
@@ -329,7 +329,7 @@ with gr.Blocks() as demo:
     # response_type.change(
     #     fn = response_type_change,
     #     inputs = response_type,
-    #     outputs = [saliency_map_method]
+    #     outputs = [activation_map_method]
     # )
 
 
@@ -424,9 +424,9 @@ with gr.Blocks() as demo:
 
     understanding_button.click(
         multimodal_understanding,
-        inputs=[model_selector, saliency_map_method, visual_pooling_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
+        inputs=[model_selector, activation_map_method, visual_pooling_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
                 visualization_layers_min, visualization_layers_max, focus, response_type],
-        outputs=[understanding_output, saliency_map_output, understanding_target_token_decoded_output]
+        outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
     )
 
 demo.launch(share=True)
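For context on the GradCAM branches above, which collect each vision block's first LayerNorm as a hook target: below is a minimal Grad-CAM-style sketch against a stock Hugging Face CLIP checkpoint. The checkpoint name, image path, channel pooling, and normalization are illustrative assumptions, not the app's clip_utils code.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Assumed checkpoint and image; the app wraps its own model in clip_utils.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("example.png").convert("RGB")  # hypothetical path

activations, gradients = [], []

def fwd_hook(module, args, output):
    activations.append(output)          # token activations, shape [1, 50, 768]

def bwd_hook(module, grad_input, grad_output):
    gradients.append(grad_output[0])    # gradient w.r.t. the module output

# Hook every encoder layer's first LayerNorm, mirroring the diff's target list.
for layer in model.vision_model.encoder.layers:
    layer.layer_norm1.register_forward_hook(fwd_hook)
    layer.layer_norm1.register_full_backward_hook(bwd_hook)

inputs = processor(text=["a bar chart"], images=image, return_tensors="pt")
score = model(**inputs).logits_per_image[0, 0]   # image-text similarity score
score.backward()                                 # hooks record the gradients

# Backward hooks fire in reverse layer order, so the last layer's activation
# pairs with the first recorded gradient.
act, grad = activations[-1], gradients[0]
cam = (grad * act).sum(dim=-1)[:, 1:]            # weight channels, drop CLS token
cam = cam.relu().reshape(7, 7)                   # 49 patches -> 7x7 grid (ViT-B/32 @ 224)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)  # normalize to [0, 1]

The app's actual helper presumably differs in pooling and layer selection (it exposes visualization layer sliders); the sketch only shows the hook-and-backprop mechanics.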
 
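The model_slider_change and focus_change hunks rely on Gradio's update pattern: an event handler returns one freshly constructed component per entry in outputs, and Gradio applies each as an update to the existing control. A self-contained sketch of that wiring, with component names and layer ranges taken loosely from the diff:

import gradio as gr

def focus_change(focus):
    # Return one component per output; Gradio treats each as an update.
    if focus == "Language Model":
        return (
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max"),
        )
    return (
        gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type"),
        gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
        gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max"),
    )

with gr.Blocks() as demo:
    focus = gr.Dropdown(choices=["Visual Encoder", "Language Model"],
                        value="Visual Encoder", label="focus")
    activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM",
                                        label="activation map type")
    vis_min = gr.Slider(minimum=1, maximum=12, value=12, step=1,
                        label="visualization layers min")
    vis_max = gr.Slider(minimum=1, maximum=12, value=12, step=1,
                        label="visualization layers max")
    focus.change(fn=focus_change, inputs=focus,
                 outputs=[activation_map_method, vis_min, vis_max])

demo.launch()

Returning constructors rather than mutating components keeps the handlers stateless, which is why each branch in the diff rebuilds the same Dropdown with only the label or layer range changed.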