"""Gradio demo that describes UI screenshots with a Pix2Struct checkpoint.

Running this script loads the ``screen2words`` Pix2Struct model and its
processor, then serves a small web UI in which a user uploads an image and
receives the generated text description.
"""

from typing import Optional

import gradio as gr
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Single source of truth for the checkpoint so the model and the processor
# can never be loaded from two different repositories by accident.
MODEL_ID = "google/pix2struct-screen2words-large"

# Load model and processor once at start-up; every request reuses them.
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_ID)
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)


def describe_ui(image: Optional[Image.Image]) -> str:
    """Generate a text description for a UI screenshot.

    Args:
        image: The uploaded picture as a PIL image (the interface below is
            configured with ``type="pil"``), or ``None`` when the form was
            submitted without an image.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable
            message instead of an opaque processor failure.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    inputs = processor(images=image, return_tensors="pt")
    outputs = model.generate(**inputs)
    # generate() returns a batch; exactly one image was submitted, so only
    # the first sequence is decoded.
    return processor.decode(outputs[0], skip_special_tokens=True)


# Build the Gradio interface. It is bound to a name so it can be referenced
# after construction; the launch itself still happens at module level.
demo = gr.Interface(
    fn=describe_ui,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model.",
)
demo.launch()