import os

import gradio as gr
import pandas as pd
import numpy as np
from apscheduler.schedulers.background import BackgroundScheduler

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

TOKEN = os.environ.get("TOKEN", None)

OWNER = "Online-Mind2Web"
# api = HfApi()

YEAR_VERSION = "2025"

LOCAL_DEBUG = True


# Display the results: load a leaderboard CSV, sort by Average SR, and format the score columns.
def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
    return df


auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')

# Column datatypes for the Gradio Dataframe components.
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]


# Reload both leaderboard CSVs so the tables can be updated without restarting the app.
def refresh():
    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_eval_dataframe_test, human_eval_dataframe_test


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks(css="""
#human-leaderboard-table {
    width: auto;                   /* allow auto sizing */
    min-width: calc(100% + 20px);  /* extend a little beyond the content */
}
""")

with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
    # gr.HTML(LEADERBOARD_HTML)

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.components.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            # column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
            # interactive=False,
            # height=700,
            # column_widths=[190, 140, 75, 75, 50, 50, 50, 50, 75],
            wrap=False,
        )

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.components.Dataframe(
            value=auto_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False,
            # column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
        ],
    )

scheduler = BackgroundScheduler()
scheduler.start()

demo.launch(debug=True)