Update evaluation code
evaluation/evaluate_mmvp_MetaCLIP_huge.py
CHANGED
@@ -7,8 +7,6 @@ import torch
 from tqdm import tqdm
 import json
 from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
-import argparse
-


 def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
@@ -51,8 +49,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -70,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 return_overflowing_tokens=False,
 padding="max_length",
 return_tensors="pt",
-)["input_ids"].to(device)
-
-
-
-
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+)["input_ids"].to(device)
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+imgs = torch.cat((img1, img2), dim=0)

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,17 +134,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = f'
+vision_tower_name = f'MetaCLIP/metaclip-h14-fullcc2.5b'

 vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-#processor = CLIPProcessor.from_pretrained(vision_tower_name)

 results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
 print(results)
-
-
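Taken together, the hunks above remove the stray `import argparse`, the commented-out `clip.tokenize` calls, and the shape comments, add the `img1` preprocessing line alongside `img2`, and fill in the `BENCHMARK_DIR` and `vision_tower_name` placeholders. For orientation, here is a minimal sketch of the per-pair scoring step this script performs, written against the public `facebook/metaclip-h14-fullcc2.5b` checkpoint on the assumption that the local `MetaCLIP/metaclip-h14-fullcc2.5b` folder mirrors it; the example statement, image paths, and the softmax readout of `logits_per_text` are illustrative, since the probability step sits outside the visible context.

```python
# Minimal sketch of the pair-scoring step, assuming the local
# "MetaCLIP/metaclip-h14-fullcc2.5b" folder mirrors the public
# facebook/metaclip-h14-fullcc2.5b checkpoint. The statement, image
# paths, and the softmax readout are illustrative, not taken from the diff.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
name = "facebook/metaclip-h14-fullcc2.5b"

model = CLIPModel.from_pretrained(name, device_map=device).eval().float()
processor = CLIPImageProcessor.from_pretrained(name)
tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)

statement1 = "a dog facing left"                   # placeholder statement
img1 = Image.open("pair_img1.jpg").convert("RGB")  # placeholder image paths
img2 = Image.open("pair_img2.jpg").convert("RGB")

text1 = tokenizer(
    "a photo of " + statement1,
    truncation=True,
    max_length=77,
    padding="max_length",
    return_tensors="pt",
)["input_ids"].to(device)                          # (1, 77)

img1 = processor.preprocess(img1, return_tensors="pt")["pixel_values"].to(device)
img2 = processor.preprocess(img2, return_tensors="pt")["pixel_values"].to(device)
imgs = torch.cat((img1, img2), dim=0)              # (2, 3, H, W)

with torch.no_grad():
    outputs1 = model(input_ids=text1, pixel_values=imgs)
    # logits_per_text is (1 statement, 2 images); softmax over the two
    # images gives the probability-like scores that get thresholded later.
    probs1 = outputs1.logits_per_text.softmax(dim=-1)
print(probs1)
```

The `f'...'` prefix on the new `vision_tower_name` is harmless but unnecessary, since the literal contains no placeholders.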
evaluation/evaluate_mmvp_MetaCLIP_large.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -70,18 +68,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 return_overflowing_tokens=False,
 padding="max_length",
 return_tensors="pt",
-)["input_ids"].to(device)
-
-
-
-
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+)["input_ids"].to(device)
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+imgs = torch.cat((img1, img2), dim=0)

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -92,7 +87,7 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

 img1_score1 = probs1[0][0]
 img1_score2 = probs2[0][0]
-
+
 pred1 = "img1" if img1_score1 > 0.5 else "img2"
 pred2 = "img1" if img1_score2 > 0.5 else "img2"

@@ -141,15 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = '
+vision_tower_name = 'MetaCLIP/metaclip-l14-fullcc2.5b'

 vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-#processor = CLIPProcessor.from_pretrained(vision_tower_name)

 results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
 print(results)
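The large-model variant gets the same cleanup, with `MetaCLIP/metaclip-l14-fullcc2.5b` as the checkpoint name. The hunk around `img1_score1` and `pred1` shows each statement's score being thresholded at 0.5; the sketch below fills in the surrounding pair-level decision under the assumption, not visible in these hunks, that `statement1` describes `img1` and `statement2` describes `img2`, so the pair counts as correct only when both predictions line up.

```python
# Sketch of the pair-level decision around the thresholding shown in the
# hunk. probs1/probs2 are assumed to be (1, 2) softmax scores of
# statement1/statement2 against (img1, img2); the "both must be right"
# rule is an assumption about the benchmark protocol, not shown here.
import torch

def pair_correct(probs1: torch.Tensor, probs2: torch.Tensor) -> bool:
    img1_score1 = probs1[0][0]   # how strongly statement1 matches img1
    img1_score2 = probs2[0][0]   # how strongly statement2 matches img1

    pred1 = "img1" if img1_score1 > 0.5 else "img2"
    pred2 = "img1" if img1_score2 > 0.5 else "img2"

    # statement1 is taken to describe img1 and statement2 to describe img2.
    return pred1 == "img1" and pred2 == "img2"

print(pair_correct(torch.tensor([[0.7, 0.3]]), torch.tensor([[0.2, 0.8]])))  # True
print(pair_correct(torch.tensor([[0.7, 0.3]]), torch.tensor([[0.9, 0.1]])))  # False
```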
evaluation/evaluate_mmvp_OpenAICLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 padding="max_length",
 return_tensors="pt",
 )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
 imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = f'
+vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14'

 vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-#processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-#vision_tower.to(torch.float32)
-# print(next(model.parameters()).device) # cuda:0

 results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
 print(results)
-
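The OpenAI CLIP script follows the same pattern, now pointing at a local `OpenAICLIP/clip-vit-large-patch14` folder. Before committing to a full benchmark run, a quick smoke test of the loading pattern can catch a mismatched checkpoint or processor early; the sketch below substitutes the public `openai/clip-vit-large-patch14` checkpoint for the local folder (an assumption) and runs a single dummy forward pass.

```python
# Smoke test for the __main__ loading pattern, using the public
# openai/clip-vit-large-patch14 checkpoint in place of the local
# "OpenAICLIP/clip-vit-large-patch14" folder from the diff (an assumption).
import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vision_tower_name = "openai/clip-vit-large-patch14"

vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)

# One dummy image and caption, so a mismatched processor or checkpoint
# fails here rather than partway through an MMVP-VLM run.
dummy = Image.new("RGB", (224, 224), color="white")
pixel_values = image_processor.preprocess(dummy, return_tensors="pt")["pixel_values"].to(device)
input_ids = tokenizer(
    "a photo of a white square",
    truncation=True,
    max_length=77,
    padding="max_length",
    return_tensors="pt",
)["input_ids"].to(device)

with torch.no_grad():
    vision_tower.eval().float()
    out = vision_tower(input_ids=input_ids, pixel_values=pixel_values)
print(out.logits_per_image.shape)  # torch.Size([1, 1])
```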
evaluation/evaluate_mmvp_OpenAICLIP_336.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 padding="max_length",
 return_tensors="pt",
 )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+imgs = torch.cat((img1, img2), dim=0)

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = f'
+vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14-336'

 vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-#processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-#vision_tower.to(torch.float32)
-# print(next(model.parameters()).device) # cuda:0

 results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
 print(results)
-
evaluation/evaluate_mmvp_SigLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 return_overflowing_tokens=False,
 padding="max_length",
 return_tensors="pt",
-)["input_ids"].to(device)
-
-
-
-
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+)["input_ids"].to(device)
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+imgs = torch.cat((img1, img2), dim=0)

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = f'
+vision_tower_name = f'SigLIP/siglip-so400m-patch14-224'

 vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
evaluation/evaluate_mmvp_SigLIP_384.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 text1 = 'a photo of ' + statement1
 text2 = 'a photo of ' + statement2

-#text1 = clip.tokenize([text1]).to(device)
-#text2 = clip.tokenize([text2]).to(device)
 text1 = tokenizer(
 text1,
 truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
 return_overflowing_tokens=False,
 padding="max_length",
 return_tensors="pt",
-)["input_ids"].to(device)
-
-
-
-
-img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+)["input_ids"].to(device)
+
+img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+imgs = torch.cat((img1, img2), dim=0)

 with torch.no_grad():
 model.eval().float()
-
-#logits_per_image2, logits_per_text2 = model(imgs, text2)
+
 outputs1 = model(input_ids=text1, pixel_values=imgs)
 logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
 outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-BENCHMARK_DIR = '
+BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-vision_tower_name = f'
+vision_tower_name = f'SigLIP/siglip-so400m-patch14-384'

 vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
 image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
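The two SigLIP scripts mirror the CLIP ones but load `SiglipModel` and `SiglipImageProcessor`; the tokenizer line falls outside the visible hunks. The sketch below stands in the public `google/siglip-so400m-patch14-384` checkpoint for the local `SigLIP/siglip-so400m-patch14-384` folder and uses `SiglipTokenizer` for the text side, both assumptions; note that SigLIP pairs its logits with a sigmoid rather than a softmax.

```python
# Sketch of the SigLIP loading variant, assuming the local "SigLIP/..."
# folder mirrors the public google/siglip-so400m-patch14-384 checkpoint
# and that the text side uses SiglipTokenizer (not shown in the hunks).
import torch
from PIL import Image
from transformers import SiglipModel, SiglipImageProcessor, SiglipTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vision_tower_name = "google/siglip-so400m-patch14-384"

vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
tokenizer = SiglipTokenizer.from_pretrained(vision_tower_name)

dummy = Image.new("RGB", (384, 384), color="gray")
pixel_values = image_processor.preprocess(dummy, return_tensors="pt")["pixel_values"].to(device)
# SigLIP's text tower expects max-length padding (64 tokens by default).
input_ids = tokenizer(
    "a photo of a gray square",
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)["input_ids"].to(device)

with torch.no_grad():
    vision_tower.eval().float()
    out = vision_tower(input_ids=input_ids, pixel_values=pixel_values)
# SigLIP scores are read through a sigmoid rather than a softmax.
print(torch.sigmoid(out.logits_per_image))
```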