You can use these models as follows (example on the ChartQA dataset):
```python
from transformers import AutoProcessor, Pix2StructForConditionalGeneration | |
import requests | |
from PIL import Image | |
model = Pix2StructForConditionalGeneration.from_pretrained("google/matcha-chartqa").to(0) | |
processor = AutoProcessor.from_pretrained("google/matcha-chartqa") | |
url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png" | |
image = Image.open(requests.get(url, stream=True).raw) | |
inputs = processor(images=image, text="Is the sum of all 4 places greater than Laos? |