thon
import requests
import torch
from PIL import Image
from transformers import AlignProcessor, AlignModel
# Zero-shot image classification with ALIGN: score one image against a set
# of candidate text labels and print the per-label probabilities.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Use a timeout so a stalled connection cannot hang the example, and fail
# loudly on an HTTP error instead of handing an error page to the decoder.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

candidate_labels = ["an image of a cat", "an image of a dog"]
# return_tensors="pt" asks the processor for PyTorch tensors.
inputs = processor(text=candidate_labels, images=image, return_tensors="pt")

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs)

# this is the image-text similarity score
logits_per_image = outputs.logits_per_image
# we can take the softmax to get the label probabilities
probs = logits_per_image.softmax(dim=1)
print(probs)

## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN.