thon | |
import requests
import torch
from PIL import Image
from transformers import AlignProcessor, AlignModel

# Load the pretrained processor and model from the same checkpoint.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

# Fetch the example image; the timeout keeps a stalled connection from
# hanging the script indefinitely.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True, timeout=30).raw)
candidate_labels = ["an image of a cat", "an image of a dog"]

# Prepare the candidate texts and the image together, as PyTorch tensors.
inputs = processor(text=candidate_labels, images=image, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# this is the image-text similarity score
logits_per_image = outputs.logits_per_image

# we can take the softmax to get the label probabilities
probs = logits_per_image.softmax(dim=1)
print(probs)
## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN.