AustingDong committed
Commit b1faf64 · 1 Parent(s): 1ca9e3b

add ChartGemma

Files changed (3):
  1. app.py +47 -12
  2. demo/cam.py +188 -1
  3. demo/model_utils.py +56 -1
app.py CHANGED
@@ -3,8 +3,8 @@ import torch
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
- from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMLLaVA
- from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, add_title_to_image
+ from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMChartGemma, AttentionGuidedCAMLLaVA
+ from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, ChartGemma_Utils, add_title_to_image

import numpy as np
import matplotlib.pyplot as plt
@@ -22,7 +22,8 @@ clip_utils = Clip_Utils()
clip_utils.init_Clip()
model_utils, vl_gpt, tokenizer = None, None, None
model_name = "Clip"
-
+ language_model_max_layer = 24
+ language_model_best_layer = 8

def clean():
    global model_utils, vl_gpt, tokenizer, clip_utils
@@ -109,7 +110,12 @@ def multimodal_understanding(model_type,

    input_ids = prepare_inputs.input_ids[0].cpu().tolist()
    input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-     start=620 if model_name.split('-')[0] == "Janus" else 512
+     if model_name.split('-')[0] == "Janus":
+         start = 620
+     elif model_name.split('-')[0] == "ChartGemma":
+         start = 1024
+     else:
+         start = 512

    if saliency_map_method == "GradCAM":
        # target_layers = vl_gpt.vision_model.vision_tower.blocks
@@ -127,8 +133,13 @@ def multimodal_understanding(model_type,
            gradcam = AttentionGuidedCAMJanus(vl_gpt, target_layers)
        elif model_name.split('-')[0] == "LLaVA":
            gradcam = AttentionGuidedCAMLLaVA(vl_gpt, target_layers)
+         elif model_name.split('-')[0] == "ChartGemma":
+             gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
+
        cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
        gradcam.remove_hooks()
+
+
        if focus == "Visual Encoder":
            cam_grid = cam_tensors.reshape(grid_size, grid_size)
            cam = [generate_gradcam(cam_grid, image)]
@@ -144,7 +155,7 @@ def multimodal_understanding(model_type,
        else:
            cam = []
            for i, cam_tensor in enumerate(cam_tensors):
-                 cam_grid = cam_tensor.reshape(24, 24)
+                 cam_grid = cam_tensor.reshape(grid_size, grid_size)
                cam_i = generate_gradcam(cam_grid, image)
                cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])

@@ -158,7 +169,7 @@ def multimodal_understanding(model_type,
# Gradio interface

def model_slider_change(model_type):
-     global model_utils, vl_gpt, tokenizer, clip_utils, model_name
+     global model_utils, vl_gpt, tokenizer, clip_utils, model_name, language_model_max_layer, language_model_best_layer
    model_name = model_type
    if model_type == "Clip":
        clean()
@@ -179,6 +190,8 @@ def model_slider_change(model_type):
        set_seed()
        model_utils = Janus_Utils()
        vl_gpt, tokenizer = model_utils.init_Janus(model_type.split('-')[-1])
+         language_model_max_layer = 24
+         language_model_best_layer = 8

        res = (
            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
@@ -195,6 +208,8 @@ def model_slider_change(model_type):
        set_seed()
        model_utils = LLaVA_Utils()
        vl_gpt, tokenizer = model_utils.init_LLaVA()
+         language_model_max_layer = 24
+         language_model_best_layer = 8

        res = (
            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
@@ -204,9 +219,29 @@ def model_slider_change(model_type):
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
        )
        return res
+
+     elif model_type.split('-')[0] == "ChartGemma":
+         clean()
+         set_seed()
+         model_utils = ChartGemma_Utils()
+         vl_gpt, tokenizer = model_utils.init_ChartGemma()
+         language_model_max_layer = 18
+         language_model_best_layer = 12
+
+         res = (
+             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
+             gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers min"),
+             gr.Slider(minimum=1, maximum=18, value=12, step=1, label="visualization layers max"),
+             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
+             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+         )
+         return res
+
+
+

def focus_change(focus):
-     global model_name
+     global model_name, language_model_max_layer
    if model_name == "Clip":
        res = (
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
@@ -219,15 +254,15 @@ def focus_change(focus):
        if response_type.value == "answer + visualization":
            res = (
                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-                 gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers min"),
-                 gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers max")
+                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
            )
            return res
        else:
            res = (
                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
-                 gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers min"),
-                 gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers max")
+                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+                 gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max")
            )
            return res

@@ -251,7 +286,7 @@ with gr.Blocks() as demo:
            saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)

        with gr.Column():
-             model_selector = gr.Dropdown(choices=["Clip", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+             model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-2B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
            response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
            focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
            saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
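Note on the app.py changes above: each model family now gets its own response-token start offset (620 for Janus, 1024 for ChartGemma, 512 otherwise) and its own language-model layer bounds (18 layers with best layer 12 for ChartGemma, 24 layers with best layer 8 for Janus and LLaVA). A minimal sketch of how these per-model constants could be kept in a single lookup table; the MODEL_SETTINGS dict and settings_for helper are hypothetical and not part of the commit:

# Hypothetical consolidation of the per-model constants used in app.py above.
# Values are taken from the diff: Janus response tokens start at index 620,
# ChartGemma at 1024, everything else at 512; ChartGemma exposes 18
# language-model layers (best layer 12), Janus and LLaVA 24 (best layer 8).
MODEL_SETTINGS = {
    "Janus":      {"start": 620,  "max_layer": 24, "best_layer": 8},
    "ChartGemma": {"start": 1024, "max_layer": 18, "best_layer": 12},
    "LLaVA":      {"start": 512,  "max_layer": 24, "best_layer": 8},
}

def settings_for(model_name: str) -> dict:
    """Return the settings for a model name such as 'ChartGemma-2B'."""
    family = model_name.split('-')[0]
    return MODEL_SETTINGS.get(family, {"start": 512, "max_layer": 24, "best_layer": 8})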
demo/cam.py CHANGED
@@ -229,8 +229,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):


        elif focus == "Language Model":
-             loss = self.target_layers[-1].attention_map.sum()
            self.model.zero_grad()
+             loss = outputs.logits.max(dim=-1).values.sum()
            loss.backward()

            self.activations = [layer.get_attn_map() for layer in self.target_layers]
@@ -429,6 +429,193 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):



+
+
+
+
+
+
+
+
+
+
+
+
+ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
+     def __init__(self, model, target_layers):
+         self.target_layers = target_layers
+         super().__init__(model)
+         self._modify_layers()
+         self._register_hooks_activations()
+
+     def _modify_layers(self):
+         for layer in self.target_layers:
+             setattr(layer, "attn_gradients", None)
+             setattr(layer, "attention_map", None)
+
+             layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
+             layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
+             layer.save_attn_map = types.MethodType(save_attn_map, layer)
+             layer.get_attn_map = types.MethodType(get_attn_map, layer)
+
+     def _forward_activate_hooks(self, module, input, output):
+         attn_output, attn_weights = output # Unpack outputs
+         print("attn_output shape:", attn_output.shape)
+         print("attn_weights shape:", attn_weights.shape)
+         module.save_attn_map(attn_weights)
+         attn_weights.register_hook(module.save_attn_gradients)
+
+     def _register_hooks_activations(self):
+         for layer in self.target_layers:
+             if hasattr(layer, "q_proj"): # is an attention layer
+                 self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
+
+     @spaces.GPU(duration=120)
+     def generate_cam(self, inputs, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
+         """ Generates Grad-CAM heatmap for ViT. """
+
+         # Forward pass
+         outputs_raw = self.model(**inputs)
+
+         image_embeddings = outputs_raw.image_hidden_states
+         inputs_embeddings = self.model.get_input_embeddings()(inputs['input_ids'])
+
+         # Pooling
+         image_embeddings_pooled = image_embeddings.mean(dim=1)
+
+         inputs_embeddings_pooled = inputs_embeddings.mean(dim=1) # end of image: 618
+         # inputs_embeddings_pooled = inputs_embeddings[
+         #     torch.arange(inputs_embeddings.shape[0], device=inputs_embeddings.device),
+         #     input_ids.to(dtype=torch.int, device=inputs_embeddings.device).argmax(dim=-1),
+         # ]
+
+
+         # Backpropagate to get gradients
+         # image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
+         # similarity = F.cosine_similarity(image_embeddings_mean, inputs_embeddings_mean, dim=-1)
+         # similarity.backward()
+         self.model.zero_grad()
+         print(outputs_raw)
+         # loss = self.target_layers[-1].attention_map.sum()
+         loss = outputs_raw.logits.max(dim=-1).values.sum()
+         loss.backward()
+
+         # get image masks
+         image_mask = []
+         last = 0
+         for i in range(inputs["input_ids"].shape[1]):
+             decoded_token = tokenizer.decode(inputs["input_ids"][0][i].item())
+             print(decoded_token)
+             if (decoded_token == "<image>"):
+                 image_mask.append(True)
+                 last = i
+             else:
+                 image_mask.append(False)
+
+
+         # Aggregate activations and gradients from ALL layers
+         self.activations = [layer.get_attn_map() for layer in self.target_layers]
+         self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+         cam_sum = None
+         # Ver 1
+         # for act, grad in zip(self.activations, self.gradients):
+         #     # act = torch.sigmoid(act)
+         #     print("act:", act)
+         #     print(len(act))
+         #     print("act_shape:", act.shape)
+         #     # print("act1_shape:", act[1].shape)
+
+         #     act = F.relu(act.mean(dim=1))
+
+
+         #     # Compute mean of gradients
+         #     print("grad:", grad)
+         #     print(len(grad))
+         #     print("grad_shape:", grad.shape)
+         #     grad_weights = grad.mean(dim=1)
+
+         #     print("act shape", act.shape)
+         #     print("grad_weights shape", grad_weights.shape)
+
+         #     cam = act * grad_weights
+         #     # cam = act
+         #     print(cam.shape)
+
+         #     # Sum across all layers
+         #     if cam_sum is None:
+         #         cam_sum = cam
+         #     else:
+         #         cam_sum += cam
+
+         # Ver 2
+         for act, grad in zip(self.activations, self.gradients):
+
+             print("act shape", act.shape)
+             print("grad shape", grad.shape)
+
+             act = F.relu(act)
+             grad = F.relu(grad)
+
+
+             cam = act * grad # shape: [1, heads, seq_len, seq_len]
+             cam = cam.sum(dim=1) # shape: [1, seq_len, seq_len]
+
+             # Sum across all layers
+             if cam_sum is None:
+                 cam_sum = cam
+             else:
+                 cam_sum += cam
+
+         cam_sum = F.relu(cam_sum)
+         cam_sum = cam_sum.to(torch.float32)
+
+         # thresholding
+         # percentile = torch.quantile(cam_sum, 0.4) # Adjust threshold dynamically
+         # cam_sum[cam_sum < percentile] = 0
+
+         # Reshape
+         # if visual_pooling_method == "CLS":
+         #     cam_sum = cam_sum[0, 1:]
+
+         # cam_sum shape: [1, seq_len, seq_len]
+         cam_sum_lst = []
+         cam_sum_raw = cam_sum
+         start_idx = 1024
+         for i in range(start_idx, cam_sum_raw.shape[1]):
+             cam_sum = cam_sum_raw[0, i, :] # shape: [1: seq_len]
+             # cam_sum_min = cam_sum.min()
+             # cam_sum_max = cam_sum.max()
+             # cam_sum = (cam_sum - cam_sum_min) / (cam_sum_max - cam_sum_min)
+             cam_sum = cam_sum[image_mask].unsqueeze(0) # shape: [1, 1024]
+             print("cam_sum shape: ", cam_sum.shape)
+             num_patches = cam_sum.shape[-1] # Last dimension of CAM output
+             grid_size = int(num_patches ** 0.5)
+             print(f"Detected grid size: {grid_size}x{grid_size}")
+
+             # Fix the reshaping step dynamically
+
+             cam_sum = cam_sum.view(grid_size, grid_size)
+             cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+             cam_sum_lst.append(cam_sum)
+
+
+         return cam_sum_lst, grid_size
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
def generate_gradcam(
    cam,
    image,
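Note on the demo/cam.py changes above: AttentionGuidedCAMChartGemma backpropagates the sum of per-position max logits, then weights each layer's attention map by its gradient (ReLU on both), sums over heads and layers, and for every non-image token keeps only the image-token columns of that token's attention row (start_idx = 1024 in the diff), reshaping them into a square grid with min-max normalization. A standalone sketch of that aggregation on dummy tensors; the layer count, sequence length, and random tensors here are made up so it runs without the model:

# Minimal sketch of the "Ver 2" Grad-CAM aggregation used above, on fake data.
import torch
import torch.nn.functional as F

num_layers, heads, seq_len, n_image = 2, 4, 1040, 1024
activations = [torch.rand(1, heads, seq_len, seq_len) for _ in range(num_layers)]
gradients   = [torch.rand(1, heads, seq_len, seq_len) for _ in range(num_layers)]
image_mask  = torch.tensor([True] * n_image + [False] * (seq_len - n_image))

cam_sum = None
for act, grad in zip(activations, gradients):
    cam = F.relu(act) * F.relu(grad)   # [1, heads, seq_len, seq_len]
    cam = cam.sum(dim=1)               # sum over heads -> [1, seq_len, seq_len]
    cam_sum = cam if cam_sum is None else cam_sum + cam

cam_sum = F.relu(cam_sum).to(torch.float32)

# One heatmap per token after the image block: take that token's attention row,
# keep only the image-token columns, reshape to a square grid, and normalize.
cams = []
for i in range(n_image, seq_len):
    row = cam_sum[0, i, :][image_mask]
    grid_size = int(row.shape[-1] ** 0.5)   # 32 for 1024 image tokens
    grid = row.view(grid_size, grid_size)
    grid = (grid - grid.min()) / (grid.max() - grid.min())
    cams.append(grid)

print(len(cams), cams[0].shape)             # 16 heatmaps of shape 32x32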
demo/model_utils.py CHANGED
@@ -2,7 +2,7 @@ import torch
import numpy as np
import spaces
from PIL import Image, ImageDraw, ImageFont
- from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor
+ from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor, PaliGemmaForConditionalGeneration
from transformers import CLIPProcessor, CLIPModel
from janus.models import MultiModalityCausalLM, VLChatProcessor

@@ -170,6 +170,61 @@ class LLaVA_Utils(Model_Utils):
        )

        return outputs
+
+
+
+
+
+ class ChartGemma_Utils(Model_Utils):
+     def __init__(self):
+         super().__init__()
+
+     def init_ChartGemma(self):
+
+         model_path = "ahmed-masry/chartgemma"
+
+
+         self.vl_gpt = PaliGemmaForConditionalGeneration.from_pretrained(
+             model_path,
+             torch_dtype=torch.float16,
+             attn_implementation="eager",
+             output_attentions=True
+         )
+         self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+         self.processor = AutoProcessor.from_pretrained(model_path)
+         self.tokenizer = self.processor.tokenizer
+
+         return self.vl_gpt, self.tokenizer
+
+     @spaces.GPU(duration=120)
+     def prepare_inputs(self, question, image):
+
+         pil_image = Image.fromarray(image)
+         prepare_inputs = self.processor(
+             images=pil_image, text=[question], return_tensors="pt"
+         ).to(self.cuda_device, dtype=self.dtype)
+
+         return prepare_inputs
+
+     @spaces.GPU(duration=120)
+     def generate_inputs_embeddings(self, prepare_inputs):
+         return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+
+     @spaces.GPU(duration=120)
+     def generate_outputs(self, prepare_inputs, temperature, top_p):
+
+         outputs = self.vl_gpt.generate(
+             **prepare_inputs,
+             max_new_tokens=512,
+             do_sample=False if temperature == 0 else True,
+             use_cache=True,
+             return_dict_in_generate=True,
+             output_attentions=True
+         )
+
+         return outputs
+
+


def add_title_to_image(image, title, font_size=20):
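Note on the demo/model_utils.py changes above: ChartGemma_Utils loads ahmed-masry/chartgemma as a PaliGemmaForConditionalGeneration in fp16 with eager attention (so attention weights can be hooked), and exposes the same prepare_inputs / generate_outputs interface as the other utils classes. A rough usage sketch, assuming the demo environment (the demo package, the spaces dependency, and a GPU) is available; the question and image below are placeholders, not from the commit:

# Hypothetical usage of the new ChartGemma path, mirroring how app.py wires it up.
import numpy as np
from demo.model_utils import ChartGemma_Utils

model_utils = ChartGemma_Utils()
vl_gpt, tokenizer = model_utils.init_ChartGemma()   # ahmed-masry/chartgemma, fp16, eager attention

image = np.zeros((448, 448, 3), dtype=np.uint8)      # placeholder chart image (numpy array, as in the demo)
prepare_inputs = model_utils.prepare_inputs("What is the highest bar?", image)

# generate_outputs returns a generate() dict (return_dict_in_generate=True), so the
# generated token ids are in outputs.sequences.
outputs = model_utils.generate_outputs(prepare_inputs, temperature=0, top_p=0.95)
print(tokenizer.decode(outputs.sequences[0], skip_special_tokens=True))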