AustingDong committed
Commit · b1faf64 · 1 Parent(s): 1ca9e3b

add ChartGemma

Files changed:
- app.py +47 -12
- demo/cam.py +188 -1
- demo/model_utils.py +56 -1
app.py
CHANGED

@@ -3,8 +3,8 @@ import torch
 from transformers import AutoConfig, AutoModelForCausalLM
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
-from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMLLaVA
-from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, add_title_to_image
+from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMChartGemma, AttentionGuidedCAMLLaVA
+from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, ChartGemma_Utils, add_title_to_image
 
 import numpy as np
 import matplotlib.pyplot as plt

@@ -22,7 +22,8 @@ clip_utils = Clip_Utils()
 clip_utils.init_Clip()
 model_utils, vl_gpt, tokenizer = None, None, None
 model_name = "Clip"
-
+language_model_max_layer = 24
+language_model_best_layer = 8
 
 def clean():
     global model_utils, vl_gpt, tokenizer, clip_utils

@@ -109,7 +110,12 @@ def multimodal_understanding(model_type,
 
     input_ids = prepare_inputs.input_ids[0].cpu().tolist()
     input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-
+    if model_name.split('-')[0] == "Janus":
+        start = 620
+    elif model_name.split('-')[0] == "ChartGemma":
+        start = 1024
+    else:
+        start = 512
 
     if saliency_map_method == "GradCAM":
         # target_layers = vl_gpt.vision_model.vision_tower.blocks

@@ -127,8 +133,13 @@ def multimodal_understanding(model_type,
             gradcam = AttentionGuidedCAMJanus(vl_gpt, target_layers)
         elif model_name.split('-')[0] == "LLaVA":
             gradcam = AttentionGuidedCAMLLaVA(vl_gpt, target_layers)
+        elif model_name.split('-')[0] == "ChartGemma":
+            gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
+
         cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
         gradcam.remove_hooks()
+
+
         if focus == "Visual Encoder":
             cam_grid = cam_tensors.reshape(grid_size, grid_size)
             cam = [generate_gradcam(cam_grid, image)]

@@ -144,7 +155,7 @@ def multimodal_understanding(model_type,
         else:
             cam = []
             for i, cam_tensor in enumerate(cam_tensors):
-                cam_grid = cam_tensor.reshape(
+                cam_grid = cam_tensor.reshape(grid_size, grid_size)
                 cam_i = generate_gradcam(cam_grid, image)
                 cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
 

@@ -158,7 +169,7 @@ def multimodal_understanding(model_type,
 # Gradio interface
 
 def model_slider_change(model_type):
-    global model_utils, vl_gpt, tokenizer, clip_utils, model_name
+    global model_utils, vl_gpt, tokenizer, clip_utils, model_name, language_model_max_layer, language_model_best_layer
     model_name = model_type
     if model_type == "Clip":
         clean()

@@ -179,6 +190,8 @@ def model_slider_change(model_type):
         set_seed()
         model_utils = Janus_Utils()
         vl_gpt, tokenizer = model_utils.init_Janus(model_type.split('-')[-1])
+        language_model_max_layer = 24
+        language_model_best_layer = 8
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),

@@ -195,6 +208,8 @@ def model_slider_change(model_type):
         set_seed()
         model_utils = LLaVA_Utils()
         vl_gpt, tokenizer = model_utils.init_LLaVA()
+        language_model_max_layer = 24
+        language_model_best_layer = 8
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),

@@ -204,9 +219,29 @@ def model_slider_change(model_type):
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
         )
         return res
+
+    elif model_type.split('-')[0] == "ChartGemma":
+        clean()
+        set_seed()
+        model_utils = ChartGemma_Utils()
+        vl_gpt, tokenizer = model_utils.init_ChartGemma()
+        language_model_max_layer = 18
+        language_model_best_layer = 12
+
+        res = (
+            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
+            gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers max"),
+            gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+        )
+        return res
+
+
+
 
 def focus_change(focus):
-    global model_name
+    global model_name, language_model_max_layer
     if model_name == "Clip":
         res = (
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),

@@ -219,15 +254,15 @@ def focus_change(focus):
     if response_type.value == "answer + visualization":
         res = (
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-            gr.Slider(minimum=1, maximum=
-            gr.Slider(minimum=1, maximum=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
         )
         return res
     else:
         res = (
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-            gr.Slider(minimum=1, maximum=
-            gr.Slider(minimum=1, maximum=
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
         )
         return res
 

@@ -251,7 +286,7 @@ with gr.Blocks() as demo:
             saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)
 
         with gr.Column():
-            model_selector = gr.Dropdown(choices=["Clip", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-2B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
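Not part of the commit: a minimal sketch that restates the per-model constants this change spreads across app.py (the `start` token offset and the language-model layer bounds). The lookup table and helper names below are hypothetical; the numbers are taken directly from the diff above, with the `else` branch falling back to start = 512.

# Illustration only: hypothetical helper, not in the repo.
# Janus -> start 620, 24 layers, best 8; ChartGemma -> start 1024, 18 layers, best 12.
MODEL_SETTINGS = {
    "Janus":      {"start": 620,  "max_layer": 24, "best_layer": 8},
    "ChartGemma": {"start": 1024, "max_layer": 18, "best_layer": 12},
    "LLaVA":      {"start": 512,  "max_layer": 24, "best_layer": 8},
}

def settings_for(model_name: str) -> dict:
    """Look up CAM settings by model family prefix, e.g. 'ChartGemma-2B' -> 'ChartGemma'."""
    family = model_name.split("-")[0]
    # Fall back to the LLaVA defaults, matching the `else: start = 512` branch above.
    return MODEL_SETTINGS.get(family, MODEL_SETTINGS["LLaVA"])

print(settings_for("ChartGemma-2B"))  # {'start': 1024, 'max_layer': 18, 'best_layer': 12}

Centralizing the numbers this way would avoid repeating the slider bounds in model_slider_change and focus_change; the commit instead keeps them as module-level globals.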
demo/cam.py
CHANGED

@@ -229,8 +229,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
 
 
         elif focus == "Language Model":
-            loss = self.target_layers[-1].attention_map.sum()
             self.model.zero_grad()
+            loss = outputs.logits.max(dim=-1).values.sum()
             loss.backward()
 
             self.activations = [layer.get_attn_map() for layer in self.target_layers]

@@ -429,6 +429,193 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
+    def __init__(self, model, target_layers):
+        self.target_layers = target_layers
+        super().__init__(model)
+        self._modify_layers()
+        self._register_hooks_activations()
+
+    def _modify_layers(self):
+        for layer in self.target_layers:
+            setattr(layer, "attn_gradients", None)
+            setattr(layer, "attention_map", None)
+
+            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
+            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
+            layer.save_attn_map = types.MethodType(save_attn_map, layer)
+            layer.get_attn_map = types.MethodType(get_attn_map, layer)
+
+    def _forward_activate_hooks(self, module, input, output):
+        attn_output, attn_weights = output  # Unpack outputs
+        print("attn_output shape:", attn_output.shape)
+        print("attn_weights shape:", attn_weights.shape)
+        module.save_attn_map(attn_weights)
+        attn_weights.register_hook(module.save_attn_gradients)
+
+    def _register_hooks_activations(self):
+        for layer in self.target_layers:
+            if hasattr(layer, "q_proj"):  # is an attention layer
+                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
+
+    @spaces.GPU(duration=120)
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
+        """ Generates Grad-CAM heatmap for ViT. """
+
+        # Forward pass
+        outputs_raw = self.model(**inputs)
+
+        image_embeddings = outputs_raw.image_hidden_states
+        inputs_embeddings = self.model.get_input_embeddings()(inputs['input_ids'])
+
+        # Pooling
+        image_embeddings_pooled = image_embeddings.mean(dim=1)
+
+        inputs_embeddings_pooled = inputs_embeddings.mean(dim=1)  # end of image: 618
+        # inputs_embeddings_pooled = inputs_embeddings[
+        #     torch.arange(inputs_embeddings.shape[0], device=inputs_embeddings.device),
+        #     input_ids.to(dtype=torch.int, device=inputs_embeddings.device).argmax(dim=-1),
+        # ]
+
+
+        # Backpropagate to get gradients
+        # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
+        # similarity = F.cosine_similarity(image_embeddings_mean, inputs_embeddings_mean, dim=-1)
+        # similarity.backward()
+        self.model.zero_grad()
+        print(outputs_raw)
+        # loss = self.target_layers[-1].attention_map.sum()
+        loss = outputs_raw.logits.max(dim=-1).values.sum()
+        loss.backward()
+
+        # get image masks
+        image_mask = []
+        last = 0
+        for i in range(inputs["input_ids"].shape[1]):
+            decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
+            print(decoded_token)
+            if (decoded_token == "<image>"):
+                image_mask.append(True)
+                last = i
+            else:
+                image_mask.append(False)
+
+
+        # Aggregate activations and gradients from ALL layers
+        self.activations = [layer.get_attn_map() for layer in self.target_layers]
+        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+        cam_sum = None
+        # Ver 1
+        # for act, grad in zip(self.activations, self.gradients):
+        #     # act = torch.sigmoid(act)
+        #     print("act:", act)
+        #     print(len(act))
+        #     print("act_shape:", act.shape)
+        #     # print("act1_shape:", act[1].shape)
+
+        #     act = F.relu(act.mean(dim=1))
+
+
+        #     # Compute mean of gradients
+        #     print("grad:", grad)
+        #     print(len(grad))
+        #     print("grad_shape:", grad.shape)
+        #     grad_weights = grad.mean(dim=1)
+
+        #     print("act shape", act.shape)
+        #     print("grad_weights shape", grad_weights.shape)
+
+        #     cam = act * grad_weights
+        #     # cam = act
+        #     print(cam.shape)
+
+        #     # Sum across all layers
+        #     if cam_sum is None:
+        #         cam_sum = cam
+        #     else:
+        #         cam_sum += cam
+
+        # Ver 2
+        for act, grad in zip(self.activations, self.gradients):
+
+            print("act shape", act.shape)
+            print("grad shape", grad.shape)
+
+            act = F.relu(act)
+            grad = F.relu(grad)
+
+
+            cam = act * grad  # shape: [1, heads, seq_len, seq_len]
+            cam = cam.sum(dim=1)  # shape: [1, seq_len, seq_len]
+
+            # Sum across all layers
+            if cam_sum is None:
+                cam_sum = cam
+            else:
+                cam_sum += cam
+
+        cam_sum = F.relu(cam_sum)
+        cam_sum = cam_sum.to(torch.float32)
+
+        # thresholding
+        # percentile = torch.quantile(cam_sum, 0.4)  # Adjust threshold dynamically
+        # cam_sum[cam_sum < percentile] = 0
+
+        # Reshape
+        # if visual_pooling_method == "CLS":
+        #     cam_sum = cam_sum[0, 1:]
+
+        # cam_sum shape: [1, seq_len, seq_len]
+        cam_sum_lst = []
+        cam_sum_raw = cam_sum
+        start_idx = 1024
+        for i in range(start_idx, cam_sum_raw.shape[1]):
+            cam_sum = cam_sum_raw[0, i, :]  # shape: [1: seq_len]
+            # cam_sum_min = cam_sum.min()
+            # cam_sum_max = cam_sum.max()
+            # cam_sum = (cam_sum - cam_sum_min) / (cam_sum_max - cam_sum_min)
+            cam_sum = cam_sum[image_mask].unsqueeze(0)  # shape: [1, 1024]
+            print("cam_sum shape: ", cam_sum.shape)
+            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
+            grid_size = int(num_patches ** 0.5)
+            print(f"Detected grid size: {grid_size}x{grid_size}")
+
+            # Fix the reshaping step dynamically
+
+            cam_sum = cam_sum.view(grid_size, grid_size)
+            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+            cam_sum_lst.append(cam_sum)
+
+
+        return cam_sum_lst, grid_size
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 def generate_gradcam(
     cam,
     image,
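Not part of the commit: a self-contained sketch of the "Ver 2" aggregation that AttentionGuidedCAMChartGemma.generate_cam performs above, with the debug prints removed. ReLU'd attention maps are multiplied by ReLU'd gradients, summed over heads and layers, and each text-token row after the image block is restricted to the <image> positions and reshaped into a square patch grid. The function name and the epsilon in the normalization are ours; the tensor shapes follow the comments in the diff.

import torch
import torch.nn.functional as F

def aggregate_attention_cam(attn_maps, attn_grads, image_mask, start_idx):
    """attn_maps / attn_grads: lists of [1, heads, seq, seq] tensors; image_mask: list[bool] of length seq."""
    cam_sum = None
    for act, grad in zip(attn_maps, attn_grads):
        cam = F.relu(act) * F.relu(grad)          # [1, heads, seq, seq]
        cam = cam.sum(dim=1)                      # [1, seq, seq]
        cam_sum = cam if cam_sum is None else cam_sum + cam

    cam_sum = F.relu(cam_sum).to(torch.float32)
    mask = torch.tensor(image_mask, dtype=torch.bool)

    grids, grid_size = [], 0
    for i in range(start_idx, cam_sum.shape[1]):
        row = cam_sum[0, i, :][mask]              # attention from token i onto the image patches
        grid_size = int(row.numel() ** 0.5)       # e.g. 1024 image tokens -> 32x32 grid
        grid = row.view(grid_size, grid_size)
        grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-8)  # per-token min-max normalization
        grids.append(grid)
    return grids, grid_size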
demo/model_utils.py
CHANGED

@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 

@@ -170,6 +170,61 @@ class LLaVA_Utils(Model_Utils):
         )
 
         return outputs
+
+
+
+
+
+class ChartGemma_Utils(Model_Utils):
+    def __init__(self):
+        super().__init__()
+
+    def init_ChartGemma(self):
+
+        model_path = "ahmed-masry/chartgemma"
+
+
+        self.vl_gpt = PaliGemmaForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            attn_implementation="eager",
+            output_attentions=True
+        )
+        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+        self.processor = AutoProcessor.from_pretrained(model_path)
+        self.tokenizer = self.processor.tokenizer
+
+        return self.vl_gpt, self.tokenizer
+
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self, question, image):
+
+        pil_image = Image.fromarray(image)
+        prepare_inputs = self.processor(
+            images=pil_image, text=[question], return_tensors="pt"
+        ).to(self.cuda_device, dtype=self.dtype)
+
+        return prepare_inputs
+
+    @spaces.GPU(duration=120)
+    def generate_inputs_embeddings(self, prepare_inputs):
+        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+
+    @spaces.GPU(duration=120)
+    def generate_outputs(self, prepare_inputs, temperature, top_p):
+
+        outputs = self.vl_gpt.generate(
+            **prepare_inputs,
+            max_new_tokens=512,
+            do_sample=False if temperature == 0 else True,
+            use_cache=True,
+            return_dict_in_generate=True,
+            output_attentions=True
+        )
+
+        return outputs
+
+
 
 
 def add_title_to_image(image, title, font_size=20):
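Not part of the commit: a hypothetical end-to-end wiring of the two new classes, mirroring what app.py does for the other models. The target-layer path (vl_gpt.language_model.model.layers[i].self_attn) is an assumption about the PaliGemma/Gemma module layout in transformers — the diff does not show how app.py actually selects target_layers for ChartGemma — and the image path and question are placeholders.

import numpy as np
from PIL import Image

from demo.model_utils import ChartGemma_Utils
from demo.cam import AttentionGuidedCAMChartGemma, generate_gradcam

model_utils = ChartGemma_Utils()
vl_gpt, tokenizer = model_utils.init_ChartGemma()

image = np.array(Image.open("chart.png").convert("RGB"))   # placeholder chart image
question = "Which month has the highest value?"
inputs = model_utils.prepare_inputs(question, image)

# Layer 12 matches language_model_best_layer for ChartGemma in app.py; the attribute
# path below is an assumption about the Gemma decoder layout, not taken from the diff.
target_layers = [vl_gpt.language_model.model.layers[12].self_attn]

gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
cam_tensors, grid_size = gradcam.generate_cam(
    inputs, tokenizer, temperature=0, top_p=0.9, focus="Language Model"
)
gradcam.remove_hooks()

heatmaps = [generate_gradcam(cam, image) for cam in cam_tensors]   # one overlay per text token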