import torch
from transformers import AutoProcessor
from transformers import AutoModelForDocumentQuestionAnswering

# Run extractive document question answering with a fine-tuned LayoutLMv2
# checkpoint: encode the (image, question) pair, take the most likely start
# and end token positions, and decode the tokens between them.
#
# NOTE: `image` and `question` are not defined in this snippet; they must
# already be in scope. `image` needs a `.convert("RGB")` method (presumably a
# PIL image -- confirm against the code that loads it).

# The processor and the model are loaded from the same checkpoint.
MODEL_CHECKPOINT = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa"

processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

# Inference only, so gradient tracking is disabled for the forward pass.
# The original indentation of this block was lost; the statements through the
# argmax calls are grouped here. Only the forward pass needs no_grad: moving
# the processor call or the post-forward lines outside gives the same result.
with torch.no_grad():
    encoding = processor(image.convert("RGB"), question, return_tensors="pt")
    outputs = model(**encoding)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    # Highest-scoring start and end token positions; .item() requires a
    # single-element result, i.e. exactly one example in the batch.
    predicted_start_idx = start_logits.argmax(-1).item()
    predicted_end_idx = end_logits.argmax(-1).item()

# The end index is inclusive, hence the +1 on the slice bound. If the
# predicted end precedes the predicted start, the slice is empty.
answer_token_ids = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
answer = processor.tokenizer.decode(answer_token_ids)
print(answer)
# Sample output recorded with the original snippet: 'lee a. waller'