```python
import requests
import torch
from PIL import Image
from transformers import AlignProcessor, AlignModel

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["an image of a cat", "an image of a dog"]

inputs = processor(text=candidate_labels, images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# this is the image-text similarity score
logits_per_image = outputs.logits_per_image

# we can take the softmax to get the label probabilities
probs = logits_per_image.softmax(dim=1)
print(probs)
```

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN.