AustingDong committed · Commit a907ad0 · 1 Parent(s): b1faf64
modify best layer for ChartGemma, removed some outputs
- app.py +4 -4
- demo/cam.py +1 -35
app.py
CHANGED
@@ -139,7 +139,7 @@ def multimodal_understanding(model_type,
     cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
     gradcam.remove_hooks()
 
-
+
     if focus == "Visual Encoder":
         cam_grid = cam_tensors.reshape(grid_size, grid_size)
         cam = [generate_gradcam(cam_grid, image)]
@@ -226,12 +226,12 @@ def model_slider_change(model_type):
         model_utils = ChartGemma_Utils()
         vl_gpt, tokenizer = model_utils.init_ChartGemma()
         language_model_max_layer = 18
-        language_model_best_layer =
+        language_model_best_layer = 15
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
-            gr.Slider(minimum=1, maximum=18, value=
-            gr.Slider(minimum=1, maximum=18, value=
+            gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=18, value=15, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
         )
demo/cam.py
CHANGED
@@ -75,7 +75,6 @@ class AttentionGuidedCAMClip(AttentionGuidedCAM):
         output.backward(output_full.text_embeds[class_idx:class_idx+1], retain_graph=True)
 
         # Aggregate activations and gradients from ALL layers
-        print(self.activations, self.gradients)
         self.model.zero_grad()
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
@@ -93,7 +92,7 @@ class AttentionGuidedCAMClip(AttentionGuidedCAM):
             cam, _ = (act * grad_weights).max(dim=-1)
             # cam, _ = grad_weights.max(dim=-1)
             # cam = self.normalize(cam)
-            print(cam.shape)
+            print("cam_shape: ", cam.shape)
 
             # Sum across all layers
             if cam_sum is None:
@@ -239,8 +238,6 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
             # act = torch.sigmoid(act)
-            print("act:", act)
-            print(len(act))
             print("act_shape:", act.shape)
             # print("act1_shape:", act[1].shape)
 
@@ -248,14 +245,9 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
 
 
             # Compute mean of gradients
-            print("grad:", grad)
-            print(len(grad))
             print("grad_shape:", grad.shape)
             grad_weights = grad.mean(dim=1)
 
-            print("act:", act)
-            print("act shape", act.shape)
-            print("grad_weights shape", grad_weights.shape)
 
             # cam, _ = (act * grad_weights).max(dim=-1)
             # cam = act * grad_weights
@@ -361,23 +353,15 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
         cam_sum = None
         for act, grad in zip(self.activations, self.gradients):
             # act = torch.sigmoid(act)
-            print("act:", act)
-            print(len(act))
             print("act_shape:", act.shape)
-            # print("act1_shape:", act[1].shape)
 
             act = F.relu(act.mean(dim=1))
 
 
             # Compute mean of gradients
-            print("grad:", grad)
-            print(len(grad))
             print("grad_shape:", grad.shape)
             grad_weights = grad.mean(dim=1)
 
-            print("act:", act)
-            print("act shape", act.shape)
-            print("grad_weights shape", grad_weights.shape)
 
             # cam, _ = (act * grad_weights).max(dim=-1)
             # cam = act * grad_weights
@@ -477,23 +461,6 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
         # Forward pass
         outputs_raw = self.model(**inputs)
 
-        image_embeddings = outputs_raw.image_hidden_states
-        inputs_embeddings = self.model.get_input_embeddings()(inputs['input_ids'])
-
-        # Pooling
-        image_embeddings_pooled = image_embeddings.mean(dim=1)
-
-        inputs_embeddings_pooled = inputs_embeddings.mean(dim=1) # end of image: 618
-        # inputs_embeddings_pooled = inputs_embeddings[
-        #     torch.arange(inputs_embeddings.shape[0], device=inputs_embeddings.device),
-        #     input_ids.to(dtype=torch.int, device=inputs_embeddings.device).argmax(dim=-1),
-        # ]
-
-
-        # Backpropagate to get gradients
-        # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
-        # similarity = F.cosine_similarity(image_embeddings_mean, inputs_embeddings_mean, dim=-1)
-        # similarity.backward()
         self.model.zero_grad()
         print(outputs_raw)
         # loss = self.target_layers[-1].attention_map.sum()
@@ -505,7 +472,6 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
         last = 0
         for i in range(inputs["input_ids"].shape[1]):
             decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
-            print(decoded_token)
             if (decoded_token == "<image>"):
                 image_mask.append(True)
                 last = i