AustingDong committed
Commit a907ad0 · 1 Parent(s): b1faf64

modify best layer for ChartGemma, removed some outputs

Files changed (2)
  1. app.py +4 -4
  2. demo/cam.py +1 -35
app.py CHANGED
@@ -139,7 +139,7 @@ def multimodal_understanding(model_type,
     cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
     gradcam.remove_hooks()
 
-
+
     if focus == "Visual Encoder":
         cam_grid = cam_tensors.reshape(grid_size, grid_size)
         cam = [generate_gradcam(cam_grid, image)]
@@ -226,12 +226,12 @@ def model_slider_change(model_type):
         model_utils = ChartGemma_Utils()
         vl_gpt, tokenizer = model_utils.init_ChartGemma()
         language_model_max_layer = 18
-        language_model_best_layer = 12
+        language_model_best_layer = 15
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
-            gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers min"),
-            gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers max"),
+            gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
         )
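
Note: ChartGemma's language model is treated as having 18 usable layers in this app, and the commit moves the default Grad-CAM visualization layer from 12 to 15. A minimal sketch of the pattern the second hunk follows, assuming an illustrative helper name `chartgemma_controls` and constant names (in the real file, `language_model_best_layer` is assigned inside `model_slider_change` and the tuple is returned from that callback):

import gradio as gr

LANGUAGE_MODEL_MAX_LAYER = 18   # ChartGemma decoder layers exposed in app.py
LANGUAGE_MODEL_BEST_LAYER = 15  # new default chosen by this commit (was 12)

def chartgemma_controls():
    # Components returned by the model-change callback when ChartGemma is selected;
    # returning fresh component instances from a Gradio event handler replaces the
    # existing ones, so the layer sliders snap to the new default.
    return (
        gr.Dropdown(choices=["Visualization only", "answer + visualization"],
                    value="Visualization only", label="response_type"),
        gr.Slider(minimum=1, maximum=LANGUAGE_MODEL_MAX_LAYER,
                  value=LANGUAGE_MODEL_BEST_LAYER, step=1,
                  label="visualization layers min"),
        gr.Slider(minimum=1, maximum=LANGUAGE_MODEL_MAX_LAYER,
                  value=LANGUAGE_MODEL_BEST_LAYER, step=1,
                  label="visualization layers max"),
        gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
        gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
    )

Returning fresh component instances from the callback is how the app swaps slider defaults whenever the model dropdown changes.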
 
demo/cam.py CHANGED
@@ -75,7 +75,6 @@ class AttentionGuidedCAMClip(AttentionGuidedCAM):
         output.backward(output_full.text_embeds[class_idx:class_idx+1], retain_graph=True)
 
         # Aggregate activations and gradients from ALL layers
-        print(self.activations, self.gradients)
         self.model.zero_grad()
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
@@ -93,7 +92,7 @@ class AttentionGuidedCAMClip(AttentionGuidedCAM):
             cam, _ = (act * grad_weights).max(dim=-1)
             # cam, _ = grad_weights.max(dim=-1)
             # cam = self.normalize(cam)
-            print(cam.shape)
+            print("cam_shape: ", cam.shape)
 
             # Sum across all layers
             if cam_sum is None:
@@ -239,8 +238,6 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
             # act = torch.sigmoid(act)
-            print("act:", act)
-            print(len(act))
             print("act_shape:", act.shape)
             # print("act1_shape:", act[1].shape)
 
@@ -248,14 +245,9 @@
 
 
             # Compute mean of gradients
-            print("grad:", grad)
-            print(len(grad))
             print("grad_shape:", grad.shape)
             grad_weights = grad.mean(dim=1)
 
-            print("act:", act)
-            print("act shape", act.shape)
-            print("grad_weights shape", grad_weights.shape)
 
             # cam, _ = (act * grad_weights).max(dim=-1)
             # cam = act * grad_weights
@@ -361,23 +353,15 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
             # act = torch.sigmoid(act)
-            print("act:", act)
-            print(len(act))
             print("act_shape:", act.shape)
-            # print("act1_shape:", act[1].shape)
 
             act = F.relu(act.mean(dim=1))
 
 
             # Compute mean of gradients
-            print("grad:", grad)
-            print(len(grad))
             print("grad_shape:", grad.shape)
             grad_weights = grad.mean(dim=1)
 
-            print("act:", act)
-            print("act shape", act.shape)
-            print("grad_weights shape", grad_weights.shape)
 
             # cam, _ = (act * grad_weights).max(dim=-1)
             # cam = act * grad_weights
@@ -477,23 +461,6 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
         # Forward pass
         outputs_raw = self.model(**inputs)
 
-        image_embeddings = outputs_raw.image_hidden_states
-        inputs_embeddings = self.model.get_input_embeddings()(inputs['input_ids'])
-
-        # Pooling
-        image_embeddings_pooled = image_embeddings.mean(dim=1)
-
-        inputs_embeddings_pooled = inputs_embeddings.mean(dim=1) # end of image: 618
-        # inputs_embeddings_pooled = inputs_embeddings[
-        #     torch.arange(inputs_embeddings.shape[0], device=inputs_embeddings.device),
-        #     input_ids.to(dtype=torch.int, device=inputs_embeddings.device).argmax(dim=-1),
-        # ]
-
-
-        # Backpropagate to get gradients
-        # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
-        # similarity = F.cosine_similarity(image_embeddings_mean, inputs_embeddings_mean, dim=-1)
-        # similarity.backward()
         self.model.zero_grad()
         print(outputs_raw)
         # loss = self.target_layers[-1].attention_map.sum()
@@ -505,7 +472,6 @@
         last = 0
         for i in range(inputs["input_ids"].shape[1]):
             decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
-            print(decoded_token)
             if (decoded_token == "<image>"):
                 image_mask.append(True)
                 last = i
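
Note: the AttentionGuidedCAM* classes edited here aggregate Grad-CAM maps with essentially the same per-layer loop, and the deleted lines were debug prints around it. A condensed sketch of that loop, assuming hooked activation/gradient tensors of shape (batch, seq_len, hidden) per target layer; `keepdim=True` is added here so the broadcast works for any batch size (the original relies on batch size 1), and the per-model classes vary the exact pooling:

import torch
import torch.nn.functional as F

def aggregate_layer_cams(activations, gradients):
    # activations / gradients: lists of (batch, seq_len, hidden) tensors captured
    # by forward/backward hooks on the target layers.
    cam_sum = None
    for act, grad in zip(activations, gradients):
        grad_weights = grad.mean(dim=1, keepdim=True)         # (batch, 1, hidden): mean gradient per channel
        cam, _ = (act * grad_weights).max(dim=-1)             # (batch, seq_len): weight activations, max over hidden dim
        cam = F.relu(cam)                                     # keep positive evidence only
        cam_sum = cam if cam_sum is None else cam_sum + cam   # sum contributions across layers
    return cam_sum

# Example with two dummy layers:
acts = [torch.randn(1, 16, 64) for _ in range(2)]
grads = [torch.randn(1, 16, 64) for _ in range(2)]
print(aggregate_layer_cams(acts, grads).shape)  # torch.Size([1, 16])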
 