```python
import requests
from PIL import Image

from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel

# load a fine-tuned image captioning model and corresponding tokenizer and image processor
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# let's perform inference on an image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = image_processor(image, return_tensors="pt").pixel_values

# autoregressively generate caption (uses greedy decoding by default)
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
# a cat laying on a blanket next to a cat laying on a bed
```
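Since `generate()` uses greedy decoding by default, you can pass standard generation arguments to trade compute for caption quality. The sketch below shows beam search as one alternative; the `num_beams` and `max_length` values are illustrative, not from the original example:

```python
# beam search often yields slightly better captions than greedy decoding;
# num_beams=4 and max_length=32 are illustrative values, not tuned settings
generated_ids = model.generate(pixel_values, num_beams=4, max_length=32, early_stopping=True)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)
```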
Loading a PyTorch checkpoint into `TFVisionEncoderDecoderModel`.
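Because a composite PyTorch checkpoint cannot be passed straight to `TFVisionEncoderDecoderModel.from_pretrained()`, one workaround is to load the checkpoint in PyTorch, save the encoder and decoder separately, and reassemble them on the TensorFlow side with `from_encoder_decoder_pretrained()`. The sketch below assumes the same `nlpconnect/vit-gpt2-image-captioning` checkpoint; `./encoder` and `./decoder` are illustrative local paths:

```python
from transformers import TFVisionEncoderDecoderModel, VisionEncoderDecoderModel

# load the PyTorch checkpoint and save its encoder and decoder separately
# ("./encoder" and "./decoder" are illustrative local paths)
_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
_model.encoder.save_pretrained("./encoder")
_model.decoder.save_pretrained("./decoder")

# reassemble the model in TensorFlow, converting each half from PyTorch weights
model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
)

# copy the composite config (e.g. decoder_start_token_id) from the original model,
# since from_encoder_decoder_pretrained builds a fresh default config
model.config = _model.config
```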