Update evaluation codes
evaluation/evaluate_mmvp_MetaCLIP_huge.py
ADDED
@@ -0,0 +1,157 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_MetaCLIP_huge', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = f'MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
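Note: the six added scripts differ only in the model classes they load, the checkpoint path, and the output filename; the scoring rule is identical. As a reading aid, the sketch below isolates that rule with hypothetical helper names (score_question and pair_correct are not part of this commit):

import torch

def score_question(logits_per_text: torch.Tensor, qid: int):
    # logits_per_text has shape [1, 2]: one caption scored against (img1, img2).
    probs = logits_per_text.softmax(dim=-1) # P(img1 | text), P(img2 | text)
    pred = "img1" if probs[0, 0] > 0.5 else "img2"
    gt = "img1" if qid % 2 == 1 else "img2" # odd question ids belong to image 1
    return pred, gt, probs[0, 0].item()

def pair_correct(logits_text1, qid1, logits_text2, qid2) -> bool:
    # MMVP_VLM credits a pair only when both captions pick the right image.
    pred1, gt1, _ = score_question(logits_text1, qid1)
    pred2, gt2, _ = score_question(logits_text2, qid2)
    return pred1 == gt1 and pred2 == gt2

Each category is assumed to contribute 15 consecutive pairs, which is why the scripts index categories[num_pairs // 15].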
evaluation/evaluate_mmvp_MetaCLIP_large.py
ADDED
@@ -0,0 +1,157 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_MetaCLIP_large', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = 'MetaCLIP_large/metaclip-l14-fullcc2.5b-7000'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_OpenAICLIP_224.py
ADDED
@@ -0,0 +1,159 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = f'OpenAICLIP_224/clip-vit-large-patch14-all-lr5-3000-res384'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    #vision_tower.to(torch.float32)
    # print(next(model.parameters()).device) # cuda:0

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_OpenAICLIP_336.py
ADDED
@@ -0,0 +1,159 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                max_length=77,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_openai}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_openai.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = f'OpenAICLIP_336/clip-vit-large-patch14-336-all-lr5-3500-512-tokens'

    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

    #vision_tower.to(torch.float32)
    # print(next(model.parameters()).device) # cuda:0

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_SigLIP_224.py
ADDED
@@ -0,0 +1,152 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_SigLIP_224', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_siglip}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_siglip.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = f'SigLIP_224/siglip-so400m-patch14-224-9000'

    vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
evaluation/evaluate_mmvp_SigLIP_384.py
ADDED
@@ -0,0 +1,152 @@
import os
import clip
from clip import load
import csv
from PIL import Image
import torch
from tqdm import tqdm
import json
from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
import argparse


def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
    csv_file = os.path.join(benchmark_dir, 'Questions.csv')

    csv_outfile = open('Prediction_Results_SigLIP_384', 'w', newline='')
    csv_writer = csv.writer(csv_outfile)
    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header

    categories = [
        'Orientation and Direction', 'Presence of Specific Features',
        'State and Condition', 'Quantity and Count',
        'Positional and Relational Context', 'Color and Appearance',
        'Structural Characteristics', 'Texts',
        'Viewpoint and Perspective'
    ]

    pair_accuracies = {category: 0 for category in categories}
    num_pairs = 0

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for i, row in tqdm(enumerate(reader)):
            qid1, qtype1, statement1 = row

            # Get next row for the pair
            row = next(reader, None)
            if not row:
                break
            qid2, qtype2, statement2 = row

            qid1, qid2 = int(qid1), int(qid2)

            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))

            text1 = 'a photo of ' + statement1
            text2 = 'a photo of ' + statement2

            #text1 = clip.tokenize([text1]).to(device)
            #text2 = clip.tokenize([text2]).to(device)
            text1 = tokenizer(
                text1,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device)
            text2 = tokenizer(
                text2,
                truncation=True,
                return_length=False,
                return_overflowing_tokens=False,
                padding="max_length",
                return_tensors="pt",
            )["input_ids"].to(device) # torch.Size([1, 77])

            #img1 = preprocess(img1).unsqueeze(0).to(device)
            #img2 = preprocess(img2).unsqueeze(0).to(device)
            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

            with torch.no_grad():
                model.eval().float()
                #logits_per_image1, logits_per_text1 = model(imgs, text1)
                #logits_per_image2, logits_per_text2 = model(imgs, text2)
                outputs1 = model(input_ids=text1, pixel_values=imgs)
                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
                outputs2 = model(input_ids=text2, pixel_values=imgs)
                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text

                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()

            img1_score1 = probs1[0][0]
            img1_score2 = probs2[0][0]

            pred1 = "img1" if img1_score1 > 0.5 else "img2"
            pred2 = "img1" if img1_score2 > 0.5 else "img2"

            gt1 = "img1" if qid1 % 2 == 1 else "img2"
            gt2 = "img1" if qid2 % 2 == 1 else "img2"

            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])

            current_category = categories[num_pairs // 15]
            if pred1 == gt1 and pred2 == gt2:
                pair_accuracies[current_category] += 1
            num_pairs += 1

    csv_outfile.close()

    # Calculate percentage accuracies
    Category_Score_List = []

    for category in pair_accuracies:
        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
        Category_Score_List.append(pair_accuracies[category])

    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)

    return pair_accuracies


def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):

    with torch.no_grad():
        clip_model.eval()

        results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}

    # Merge results
    results = {**results_siglip}

    # Convert results to format suitable for star plot
    categories = results[list(results.keys())[0]].keys()
    data = {'Categories': list(categories)}
    for model in list(results_siglip.keys()):
        data[model] = [results[model][category] for category in categories]

    return results


if __name__ == "__main__":

    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vision_tower_name = f'SigLIP_384/siglip-so400m-patch14-384-7500'

    vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
    image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
    tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)

    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
    print(results)
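Each script writes its per-pair predictions to a Prediction_Results_* file in the working directory, using the header shown above. A minimal sketch, not part of this commit, for summarizing those files after the runs complete (it assumes the scripts were launched from the same directory):

import csv
import glob

# Recompute overall pair accuracy from the prediction files written by the scripts above.
for path in sorted(glob.glob('Prediction_Results_*')):
    with open(path, newline='') as f:
        rows = list(csv.DictReader(f))
    correct = sum(r['pred1'] == r['gt1'] and r['pred2'] == r['gt2'] for r in rows)
    if rows:
        print(f"{path}: {correct}/{len(rows)} pairs correct ({100 * correct / len(rows):.1f}%)")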