Ahmadzei's picture
added 3 more tables for large emb model
5fa1a76
thon
# Zero-shot image classification with ALIGN: score one image against a set
# of candidate text labels and turn the scores into label probabilities.
import requests
import torch
from PIL import Image
from transformers import AlignModel, AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the download time and fail loudly on an HTTP error instead of
# handing an error page to PIL, which would raise an unrelated decode error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

candidate_labels = ["an image of a cat", "an image of a dog"]

# The processor prepares both the text labels and the image as PyTorch
# tensors ("pt") in a single call.
inputs = processor(text=candidate_labels, images=image, return_tensors="pt")

# Inference only: skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# this is the image-text similarity score
logits_per_image = outputs.logits_per_image

# we can take the softmax to get the label probabilities
probs = logits_per_image.softmax(dim=1)
print(probs)
Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN.