Commit 92eedf0 (verified) by msj9817 · Parent(s): f661bdf

Update evaluation codes

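Adds six standalone MMVP-VLM evaluation scripts, one per vision tower (MetaCLIP huge and large, two fine-tuned OpenAI CLIP variants, and SigLIP at 224 px and 384 px). Each script loads its checkpoint from a hard-coded local directory, reads Questions.csv (question id, question type, statement) and 'MLLM_VLM Images/<question type>/<qid>.jpg' from a hard-coded BENCHMARK_DIR, scores both statements of a question pair against the pair of images, writes per-pair predictions to a Prediction_Results_* file, and prints per-category and average pair accuracies.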
evaluation/evaluate_mmvp_MetaCLIP_huge.py ADDED
@@ -0,0 +1,157 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_MetaCLIP_huge', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # torch.Size([1, 77])
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_openai}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_openai.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = f'MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000'
+
+    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
+    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)
+
+
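The script above, like the five that follow, hard-codes both the checkpoint directory and BENCHMARK_DIR and imports argparse without using it. For quick experiments, a minimal driver along the following lines should work; it is a sketch rather than part of this commit, and it assumes the evaluation package is importable from the repository root (e.g. via an evaluation/__init__.py or PYTHONPATH=.) and that the default paths exist locally:

# Hypothetical driver (not part of this commit): reuse official_evaluation() from the
# script above, but take the checkpoint and benchmark locations from the command line.
import argparse

import torch
from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer

from evaluation.evaluate_mmvp_MetaCLIP_huge import official_evaluation

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MMVP-VLM evaluation driver (sketch)")
    parser.add_argument("--checkpoint", default="MetaCLIP_huge/metaclip-h14-fullcc2.5b-6000",
                        help="local directory holding the fine-tuned CLIP checkpoint")
    parser.add_argument("--benchmark_dir", default="/group/40033/public_datasets/MMVP_VLM",
                        help="directory containing Questions.csv and 'MLLM_VLM Images'")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Same loading calls as in the script above, pointed at the CLI arguments.
    model = CLIPModel.from_pretrained(args.checkpoint, device_map=device)
    image_processor = CLIPImageProcessor.from_pretrained(args.checkpoint)
    tokenizer = CLIPTokenizer.from_pretrained(args.checkpoint, max_length=77)

    results = official_evaluation(image_processor, tokenizer, model, args.checkpoint,
                                  args.benchmark_dir, device)
    print(results)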
evaluation/evaluate_mmvp_MetaCLIP_large.py ADDED
@@ -0,0 +1,157 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_MetaCLIP_large', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # torch.Size([1, 77])
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_openai}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_openai.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = 'MetaCLIP_large/metaclip-l14-fullcc2.5b-7000'
+
+    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
+    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)
+
+
evaluation/evaluate_mmvp_OpenAICLIP_224.py ADDED
@@ -0,0 +1,159 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # torch.Size([1, 77])
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_openai}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_openai.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = f'OpenAICLIP_224/clip-vit-large-patch14-all-lr5-3000-res384'
+
+    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
+    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
+
+    #vision_tower.to(torch.float32)
+    # print(next(model.parameters()).device) # cuda:0
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)
+
evaluation/evaluate_mmvp_OpenAICLIP_336.py ADDED
@@ -0,0 +1,159 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_OpenAICLIP', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                max_length=77,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # torch.Size([1, 77])
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_openai = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_openai}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_openai.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = f'OpenAICLIP_336/clip-vit-large-patch14-336-all-lr5-3500-512-tokens'
+
+    vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
+    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
+
+    #vision_tower.to(torch.float32)
+    # print(next(model.parameters()).device) # cuda:0
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)
+
evaluation/evaluate_mmvp_SigLIP_224.py ADDED
@@ -0,0 +1,152 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_SigLIP_224', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # padded to the SigLIP tokenizer's model_max_length
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_siglip}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_siglip.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = f'SigLIP_224/siglip-so400m-patch14-224-9000'
+
+    vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)
evaluation/evaluate_mmvp_SigLIP_384.py ADDED
@@ -0,0 +1,152 @@
+import os
+import clip
+from clip import load
+import csv
+from PIL import Image
+import torch
+from tqdm import tqdm
+import json
+from transformers import SiglipProcessor, SiglipModel, SiglipImageProcessor, SiglipTokenizer
+import argparse
+
+
+
+def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
+
+    image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images')
+    csv_file = os.path.join(benchmark_dir, 'Questions.csv')
+
+    csv_outfile = open('Prediction_Results_SigLIP_384', 'w', newline='')
+    csv_writer = csv.writer(csv_outfile)
+    csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header
+
+    categories = [
+        'Orientation and Direction', 'Presence of Specific Features',
+        'State and Condition', 'Quantity and Count',
+        'Positional and Relational Context', 'Color and Appearance',
+        'Structural Characteristics', 'Texts',
+        'Viewpoint and Perspective'
+    ]
+
+    pair_accuracies = {category: 0 for category in categories}
+    num_pairs = 0
+
+    with open(csv_file, 'r') as f:
+        reader = csv.reader(f)
+        next(reader) # skip header
+        for i, row in tqdm(enumerate(reader)):
+            qid1, qtype1, statement1 = row
+
+            # Get next row for the pair
+            row = next(reader, None)
+            if not row:
+                break
+            qid2, qtype2, statement2 = row
+
+            qid1, qid2 = int(qid1), int(qid2)
+
+            img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg'))
+            img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg'))
+
+            text1 = 'a photo of ' + statement1
+            text2 = 'a photo of ' + statement2
+
+            #text1 = clip.tokenize([text1]).to(device)
+            #text2 = clip.tokenize([text2]).to(device)
+            text1 = tokenizer(
+                text1,
+                truncation=True,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device)
+            text2 = tokenizer(
+                text2,
+                truncation=True,
+                return_length=False,
+                return_overflowing_tokens=False,
+                padding="max_length",
+                return_tensors="pt",
+            )["input_ids"].to(device) # padded to the SigLIP tokenizer's model_max_length
+
+            #img1 = preprocess(img1).unsqueeze(0).to(device)
+            #img2 = preprocess(img2).unsqueeze(0).to(device)
+            img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+            imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+            with torch.no_grad():
+                model.eval().float()
+                #logits_per_image1, logits_per_text1 = model(imgs, text1)
+                #logits_per_image2, logits_per_text2 = model(imgs, text2)
+                outputs1 = model(input_ids=text1, pixel_values=imgs)
+                logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
+                outputs2 = model(input_ids=text2, pixel_values=imgs)
+                logits_per_image2, logits_per_text2 = outputs2.logits_per_image, outputs2.logits_per_text
+
+                probs1 = logits_per_text1.softmax(dim=-1).cpu().numpy()
+                probs2 = logits_per_text2.softmax(dim=-1).cpu().numpy()
+
+            img1_score1 = probs1[0][0]
+            img1_score2 = probs2[0][0]
+
+            pred1 = "img1" if img1_score1 > 0.5 else "img2"
+            pred2 = "img1" if img1_score2 > 0.5 else "img2"
+
+            gt1 = "img1" if qid1 % 2 == 1 else "img2"
+            gt2 = "img1" if qid2 % 2 == 1 else "img2"
+
+            csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2])
+
+            current_category = categories[num_pairs // 15]
+            if pred1 == gt1 and pred2 == gt2:
+                pair_accuracies[current_category] += 1
+            num_pairs += 1
+
+    csv_outfile.close()
+
+    # Calculate percentage accuracies
+    Category_Score_List = []
+
+    for category in pair_accuracies:
+        pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100
+        Category_Score_List.append(pair_accuracies[category])
+
+    pair_accuracies['average_score'] = sum(Category_Score_List)/len(Category_Score_List)
+
+    return pair_accuracies
+
+
+def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_dir, device):
+
+    with torch.no_grad():
+        clip_model.eval()
+
+        results_siglip = {f'{model_name}': benchmark_model(processor, tokenizer, clip_model, benchmark_dir, device)}
+
+    # Merge results
+    results = {**results_siglip}
+
+    # Convert results to format suitable for star plot
+    categories = results[list(results.keys())[0]].keys()
+    data = {'Categories': list(categories)}
+    for model in list(results_siglip.keys()):
+        data[model] = [results[model][category] for category in categories]
+
+    return results
+
+
+if __name__ == "__main__":
+
+    BENCHMARK_DIR = '/group/40033/public_datasets/MMVP_VLM'
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    vision_tower_name = f'SigLIP_384/siglip-so400m-patch14-384-7500'
+
+    vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
+    image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
+    tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)
+
+    results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
+    print(results)