Image Feature Extraction
Transformers
msj9817 committed (verified)
Commit 952642c · Parent(s): 92eedf0

Update evaluation codes

evaluation/evaluate_mmvp_MetaCLIP_huge.py CHANGED
@@ -7,8 +7,6 @@ import torch
 from tqdm import tqdm
 import json
 from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
-import argparse
-
 
 
 def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
@@ -51,8 +49,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -70,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         return_overflowing_tokens=False,
         padding="max_length",
         return_tensors="pt",
-    )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+    )["input_ids"].to(device)
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+    imgs = torch.cat((img1, img2), dim=0)
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,17 +134,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000'
+    vision_tower_name = f'MetaCLIP/metaclip-h14-fullcc2.5b'
 
     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
 
     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
-
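After this change the script reads the benchmark location from a `YOUR_MMVP_VLM_PATH` placeholder and loads a generic `MetaCLIP/metaclip-h14-fullcc2.5b` directory instead of hard-coded cluster paths and fine-tuned checkpoints. Below is a minimal sketch of the scoring path used by the updated `benchmark_model`, assuming the public `facebook/metaclip-h14-fullcc2.5b` Hub checkpoint stands in for the local directory; the image files, the example statement, and the final softmax over `logits_per_text` are illustrative, not part of the commit.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

# Assumption: the public Hub checkpoint stands in for the local MetaCLIP directory.
name = "facebook/metaclip-h14-fullcc2.5b"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained(name).to(device).eval().float()
processor = CLIPImageProcessor.from_pretrained(name)
tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)

# One statement scored against a pair of images, mirroring benchmark_model().
text = tokenizer("a photo of " + "a dog facing the camera", truncation=True,
                 padding="max_length", return_tensors="pt")["input_ids"].to(device)
img1 = processor.preprocess(Image.open("img1.jpg"), return_tensors="pt")["pixel_values"].to(device)
img2 = processor.preprocess(Image.open("img2.jpg"), return_tensors="pt")["pixel_values"].to(device)
imgs = torch.cat((img1, img2), dim=0)

with torch.no_grad():
    out = model(input_ids=text, pixel_values=imgs)

# logits_per_text has shape (1, 2); a softmax over the two images is one way
# to turn the logits into the per-image probabilities used downstream.
probs = out.logits_per_text.softmax(dim=-1)
print(probs)
```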
evaluation/evaluate_mmvp_MetaCLIP_large.py CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -70,18 +68,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         return_overflowing_tokens=False,
         padding="max_length",
         return_tensors="pt",
-    )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+    )["input_ids"].to(device)
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+    imgs = torch.cat((img1, img2), dim=0)
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -92,7 +87,7 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 
     img1_score1 = probs1[0][0]
     img1_score2 = probs2[0][0]
-
+
     pred1 = "img1" if img1_score1 > 0.5 else "img2"
     pred2 = "img1" if img1_score2 > 0.5 else "img2"
 
@@ -141,15 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = 'MetaCLIP_large/metaclip-l14-fullcc2.5b-7000'
+    vision_tower_name = 'MetaCLIP/metaclip-l14-fullcc2.5b'
 
     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
 
     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
evaluation/evaluate_mmvp_OpenAICLIP_224.py CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         padding="max_length",
         return_tensors="pt",
     )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
     imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'OpenAICLIP_224/clip-vit-large-patch14-all-lr5-3000-res384'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14'
 
     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0
 
     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
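This commit also drops the commented-out `CLIPProcessor` line that each script carried. For reference only, a single `CLIPProcessor` can replace the separate `CLIPImageProcessor`/`CLIPTokenizer` pair used above; a minimal sketch, assuming the stock `openai/clip-vit-large-patch14` checkpoint rather than the repository's local `OpenAICLIP/` directory, with placeholder image files:

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

name = "openai/clip-vit-large-patch14"  # assumed stand-in for the local OpenAICLIP directory
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained(name).to(device).eval()
processor = CLIPProcessor.from_pretrained(name)

# The processor tokenizes the text and resizes/normalizes the images in one call.
inputs = processor(text=["a photo of a red apple"],
                   images=[Image.open("img1.jpg"), Image.open("img2.jpg")],
                   padding="max_length", return_tensors="pt").to(device)

with torch.no_grad():
    out = model(**inputs)

print(out.logits_per_text.softmax(dim=-1))  # statement vs. the two 224x224 crops
```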
evaluation/evaluate_mmvp_OpenAICLIP_336.py CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         padding="max_length",
         return_tensors="pt",
     )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+    imgs = torch.cat((img1, img2), dim=0)
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'OpenAICLIP_336/clip-vit-large-patch14-336-all-lr5-3500-512-tokens'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14-336'
 
     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0
 
     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
-
evaluation/evaluate_mmvp_SigLIP_224.py CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         return_overflowing_tokens=False,
         padding="max_length",
         return_tensors="pt",
-    )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+    )["input_ids"].to(device)
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+    imgs = torch.cat((img1, img2), dim=0)
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'SigLIP_224/siglip-so400m-patch14-224-9000'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-224'
 
     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
evaluation/evaluate_mmvp_SigLIP_384.py CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
     text1 = 'a photo of ' + statement1
     text2 = 'a photo of ' + statement2
 
-    #text1 = clip.tokenize([text1]).to(device)
-    #text2 = clip.tokenize([text2]).to(device)
     text1 = tokenizer(
         text1,
         truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         return_overflowing_tokens=False,
         padding="max_length",
         return_tensors="pt",
-    )["input_ids"].to(device) # torch.Size([1, 77])
-
-    #img1 = preprocess(img1).unsqueeze(0).to(device)
-    #img2 = preprocess(img2).unsqueeze(0).to(device)
-    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-    imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+    )["input_ids"].to(device)
+
+    img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+    img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+    imgs = torch.cat((img1, img2), dim=0)
 
     with torch.no_grad():
         model.eval().float()
-        #logits_per_image1, logits_per_text1 = model(imgs, text1)
-        #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
         outputs1 = model(input_ids=text1, pixel_values=imgs)
         logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
         outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_
 
 if __name__ == "__main__":
 
-    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'SigLIP_384/siglip-so400m-patch14-384-7500'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-384'
 
     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
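The SigLIP scripts follow the same pattern but load `SiglipModel` and `SiglipImageProcessor` instead of the CLIP classes. A minimal sketch of that variant, assuming the public `google/siglip-so400m-patch14-384` checkpoint and an `AutoTokenizer` in place of the local `SigLIP/` directory (image files and the statement are placeholders); note that SigLIP logits are trained against a sigmoid rather than a softmax objective:

```python
import torch
from PIL import Image
from transformers import SiglipModel, SiglipImageProcessor, AutoTokenizer

name = "google/siglip-so400m-patch14-384"  # assumed stand-in for the local SigLIP directory
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SiglipModel.from_pretrained(name).to(device).eval().float()
processor = SiglipImageProcessor.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

# SigLIP expects fixed-length padded text, so pad to the tokenizer's max length.
text = tokenizer("a photo of " + "a dog facing the camera",
                 padding="max_length", return_tensors="pt")["input_ids"].to(device)
img1 = processor.preprocess(Image.open("img1.jpg"), return_tensors="pt")["pixel_values"].to(device)
img2 = processor.preprocess(Image.open("img2.jpg"), return_tensors="pt")["pixel_values"].to(device)
imgs = torch.cat((img1, img2), dim=0)

with torch.no_grad():
    out = model(input_ids=text, pixel_values=imgs)

# logits_per_text: (1, 2); a sigmoid gives an independent match score per image.
print(torch.sigmoid(out.logits_per_text))
```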