Valentin Buchner committed on
Commit
5d1d0b5
·
1 Parent(s): 0724c4e

huggingface leaderboard and github action to update markdown leaderboard from json

Browse files
.github/scripts/generate-leaderboard.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+
4
def generate_markdown(data):
    """Render the leaderboard JSON as a Markdown document.

    Args:
        data: Parsed ``leaderboard.json`` -- a dict with a ``"models"`` list,
            where each entry carries ``"name"``, ``"url"`` and a ``"scores"``
            mapping keyed by the category abbreviations below.

    Returns:
        str: Complete Markdown for ``Leaderboard.md`` (centered header,
        GC@3 score table, and abbreviation legend).
    """
    # Column order must stay in sync with the table header emitted below.
    # Hoisted out of the model loop so it is built once.
    score_keys = [
        "Mean",
        "Exist",
        "Count",
        "Posi",
        "Col",
        "Post",
        "Cel",
        "Sce",
        "Lan",
        "Art",
        "Comm",
        "VisMean",
        "Code",
        "Num",
        "Tran",
        "OCR",
        "TextMean",
    ]
    markdown = """<div align="center">\n\n"""
    markdown += "# 🔥🏅️GenCeption Leaderboard 🏅️🔥\n\n"
    markdown += """\n\n</div>\n\n"""
    markdown += "#### GC@3 scores for different models and categories:\n"
    markdown += "| Model | **Mean** | Exist. | Count | Posi. | Col. | Post. | Cel. | Sce. | Lan. | Art. | Comm. | **Vis Mean** | Code | Num. | Tran. | OCR | **Text Mean** |\n"
    markdown += (
        "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n"
    )
    for model in data["models"]:
        scores = model["scores"]
        markdown += f"| [{model['name']}]({model['url']}) "
        for score_key in score_keys:
            # Tolerate missing categories (render "-") instead of raising
            # KeyError, so a partially evaluated model can still be listed.
            value = scores.get(score_key, "-")
            if "Mean" in score_key:
                # Aggregate columns are emphasised in bold.
                markdown += f"| **{value}** "
            else:
                markdown += f"| {value} "
        markdown += "|\n"
    # Legend abbreviation is "Comm." to match the table header above
    # (previously listed as "Com. R.", which matched no column).
    markdown += """\n\nLegend:
- Exist.: Existence
- Count: Count
- Posi.: Position
- Col.: Color
- Post.: Poster
- Cel.: Celebrity
- Sce.: Scene
- Lan.: Landmark
- Art.: Artwork
- Comm.: Commonsense Reasoning
- Code: Code Reasoning
- Num.: Numerical Calculation
- Tran.: Text Translation
- OCR: OCR"""
    return markdown
56
+
57
+
58
+ if __name__ == "__main__":
59
+ with open("leaderboard/leaderboard.json", "r") as f:
60
+ data = json.load(f)
61
+ markdown_content = generate_markdown(data)
62
+ with open("leaderboard/Leaderboard.md", "w") as f:
63
+ f.write(markdown_content)
.github/workflows/update-leaderboard.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Update Markdown from JSON

on:
  push:
    paths:
      - 'leaderboard/leaderboard.json'

# The default GITHUB_TOKEN needs explicit write access to push the
# regenerated Markdown back to the repository.
permissions:
  contents: write

jobs:
  update-markdown:
    runs-on: ubuntu-latest
    steps:
      # v2 of these actions runs on the retired Node 12 runtime;
      # use the current major versions.
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'
      # No dependency-install step: the generator only uses the standard
      # library (json), so the previous `pip install pyyaml` was dead weight.
      - name: Generate Markdown
        # NOTE: the script file is named with a hyphen, not an underscore —
        # the previous path (generate_leaderboard.py) did not exist.
        run: python .github/scripts/generate-leaderboard.py
      - name: Commit and push if changed
        run: |
          git config --global user.email "action@github.com"
          git config --global user.name "GitHub Action"
          # Stage exactly the file the generator writes (was README.md,
          # which the script never touches). Dropped `commit -a` so no
          # unrelated tracked changes are swept into the commit.
          git add leaderboard/Leaderboard.md
          git commit -m "Update Leaderboard.md from JSON data" || echo "No changes to commit"
          git push
Leaderboard.md DELETED
@@ -1,163 +0,0 @@
1
- # 🔥🏅️GenCeption Leaderboard 🏅️🔥
2
-
3
- Evaluated MLLMs: [ChatGPT-4V](https://cdn.openai.com/papers/GPTV_System_Card.pdf), [mPLUG-Owl2](https://arxiv.org/pdf/2311.04257.pdf), [LLaVA-13B](https://arxiv.org/pdf/2304.08485.pdf), [LLaVA-7B](https://arxiv.org/pdf/2304.08485.pdf)
4
-
5
- <table>
6
- <tr><th>Existence </th><th>Count</th></tr>
7
- <tr><td>
8
-
9
- | Model | GC@3|
10
- |--|--|
11
- | ChatGPT-4V|0.422 |
12
- | mPLUG-Owl2|0.323 |
13
- | LLaVA-7B|0.308 |
14
- | LLaVA-13B|0.305 |
15
-
16
- </td><td>
17
-
18
- | Model | GC@3|
19
- |--|--|
20
- | ChatGPT-4V|0.404 |
21
- | mPLUG-Owl2|0.299 |
22
- | LLaVA-13B|0.294 |
23
- | LLaVA-7B|0.353 |
24
-
25
- </td></tr> </table>
26
-
27
-
28
- <table>
29
- <tr><th>Position </th><th>Color</th></tr>
30
- <tr><td>
31
-
32
- | Model | GC@3|
33
- |--|--|
34
- | ChatGPT-4V|0.408|
35
- | mPLUG-Owl2|0.306 |
36
- | LLaVA-7B|0.285 |
37
- | LLaVA-13B|0.255 |
38
-
39
- </td><td>
40
-
41
- | Model | GC@3|
42
- |--|--|
43
- | ChatGPT-4V|0.403 |
44
- | LLaVA-13B|0.300 |
45
- | mPLUG-Owl2|0.290 |
46
- | LLaVA-7B|0.284 |
47
-
48
- </td></tr> </table>
49
-
50
-
51
- <table>
52
- <tr><th>Poster </th><th>Celebrity</th></tr>
53
- <tr><td>
54
-
55
- | Model | GC@3|
56
- |--|--|
57
- | ChatGPT-4V|0.324|
58
- | mPLUG-Owl2|0.243 |
59
- | LLaVA-13B|0.215 |
60
- | LLaVA-7B|0.214 |
61
-
62
- </td><td>
63
-
64
- | Model | GC@3|
65
- |--|--|
66
- | ChatGPT-4V|0.332 |
67
- | mPLUG-Owl2|0.232 |
68
- | LLaVA-13B|0.206 |
69
- | LLaVA-7B|0.188 |
70
-
71
- </td></tr> </table>
72
-
73
-
74
- <table>
75
- <tr><th>Scene </th><th>Landmark</th></tr>
76
- <tr><td>
77
-
78
- | Model | GC@3|
79
- |--|--|
80
- | ChatGPT-4V|0.393|
81
- | mPLUG-Owl2|0.299 |
82
- | LLaVA-13B|0.277 |
83
- | LLaVA-7B|0.266 |
84
-
85
- </td><td>
86
-
87
- | Model | GC@3|
88
- |--|--|
89
- | ChatGPT-4V|0.353 |
90
- | mPLUG-Owl2|0.275 |
91
- | LLaVA-7B|0.252 |
92
- | LLaVA-13B|0.242 |
93
-
94
- </td></tr> </table>
95
-
96
-
97
- <table>
98
- <tr><th>Artwork </th><th>Commonsense Reasoning</th></tr>
99
- <tr><td>
100
-
101
- | Model | GC@3|
102
- |--|--|
103
- | ChatGPT-4V|0.421|
104
- | mPLUG-Owl2|0.252 |
105
- | LLaVA-13B|0.212 |
106
- | LLaVA-7B|0.210 |
107
-
108
- </td><td>
109
-
110
- | Model | GC@3|
111
- |--|--|
112
- | ChatGPT-4V|0.471 |
113
- | mPLUG-Owl2|0.353 |
114
- | LLaVA-13B|0.334 |
115
- | LLaVA-7B|0.294 |
116
-
117
- </td></tr> </table>
118
-
119
-
120
- <table>
121
- <tr><th>Code Reasoning </th><th>Numerical Calculation</th></tr>
122
- <tr><td>
123
-
124
- | Model | GC@3|
125
- |--|--|
126
- | ChatGPT-4V|0.193|
127
- | mPLUG-Owl2|0.176 |
128
- | LLaVA-13B|0.144 |
129
- | LLaVA-7B|0.107 |
130
-
131
- </td><td>
132
-
133
- | Model | GC@3|
134
- |--|--|
135
- | ChatGPT-4V|0.240 |
136
- | LLaVA-13B|0.195 |
137
- | mPLUG-Owl2|0.192 |
138
- | LLaVA-7B|0.155 |
139
-
140
- </td></tr> </table>
141
-
142
-
143
- <table>
144
- <tr><th>Text Translation </th><th>OCR</th></tr>
145
- <tr><td>
146
-
147
- | Model | GC@3|
148
- |--|--|
149
- | ChatGPT-4V|0.157|
150
- | LLaVA-13B|0.116 |
151
- | LLaVA-7B|0.111 |
152
- | mPLUG-Owl2|0.081 |
153
-
154
- </td><td>
155
-
156
- | Model | GC@3|
157
- |--|--|
158
- | ChatGPT-4V|0.393 |
159
- | mPLUG-Owl2|0.276 |
160
- | LLaVA-13B|0.239 |
161
- | LLaVA-7B|0.222 |
162
-
163
- </td></tr> </table>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -22,7 +22,7 @@ We demonstrate a 5-iteration GenCeption procedure below run on seed images to e
22
 
23
 
24
  ## Contribute
25
- Please **create PR (Pull-Request)** to contribute your results to the [🔥🏅️**Leaderboard**🏅️🔥](https://github.com/EQTPartners/GenCeption/blob/main/Leaderboard.md). Start by creating your virtual environment:
26
 
27
  ```{bash}
28
  conda create --name genception python=3.10 -y
 
22
 
23
 
24
  ## Contribute
25
+ Please add your model details and results to `leaderboard/leaderboard.json` and **create a PR (Pull-Request)** to contribute your results to the [🔥🏅️**Leaderboard**🏅️🔥](https://github.com/EQTPartners/GenCeption/blob/main/leaderboard/Leaderboard.md). Start by creating your virtual environment:
26
 
27
  ```{bash}
28
  conda create --name genception python=3.10 -y
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apscheduler.schedulers.background import BackgroundScheduler
2
+ from leaderboard.content import (
3
+ TITLE,
4
+ BANNER,
5
+ INTRO,
6
+ INTRO2,
7
+ CITATION_BUTTON_LABEL,
8
+ CITATION_BUTTON_TEXT,
9
+ )
10
+ import gradio as gr
11
+ import pandas as pd
12
+ import json
13
+
14
+
15
# Module-level cache of the leaderboard table; populated/refreshed by
# update_data() and read when the Gradio UI is built below.
df = pd.DataFrame()
16
+
17
+
18
def update_data():
    """Re-read leaderboard.json from disk and rebuild the global dataframe."""
    global df
    with open("leaderboard/leaderboard.json", "r") as source:
        df = create_dataframe(json.load(source))
23
+
24
+
25
def filter_columns(df, show_all):
    """Return *df* unchanged, or restricted to the Model and *Mean columns.

    When ``show_all`` is falsy, only the "Model" column and any column whose
    name contains "Mean" are kept, preserving their original order.
    """
    if not show_all:
        keep = []
        for column in df.columns:
            if column == "Model" or "Mean" in column:
                keep.append(column)
        return df[keep]
    return df
31
+
32
+
33
def create_dataframe(data):
    """Build the display dataframe from the parsed leaderboard JSON.

    Model names become HTML links, the score columns are taken verbatim
    from each model's "scores" mapping, and every column whose name
    contains "Mean" is wrapped in <strong> for bold rendering.
    """
    records = [
        {
            "Model": (
                f'<a href="{entry["url"]}" target="_blank" '
                f'style="color: blue; text-decoration: underline;">{entry["name"]}</a>'
            ),
            **entry["scores"],
        }
        for entry in data["models"]
    ]
    table = pd.DataFrame(records)

    # Aggregate ("Mean") columns are emphasised in bold.
    for column in [c for c in table.columns if "Mean" in c]:
        table[column] = table[column].apply(lambda value: f"<strong>{value}</strong>")

    return table
48
+
49
+
50
def update_display(show_all, df):
    """Return the (possibly column-filtered) table and a legend visibility update.

    The legend accordion is only shown when all category columns are visible,
    since it explains the abbreviated column headers.
    """
    return filter_columns(df, show_all), gr.update(visible=show_all)
54
+
55
# --- Gradio app wiring (runs at import time) -------------------------------
# Populate the global `df` once before the UI is built.
update_data()
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.HTML(BANNER)
    gr.Markdown(INTRO, elem_classes="markdown-text")
    gr.Markdown(INTRO2, elem_classes="markdown-text")
    # Toggle between the full per-category table and the Mean-only view.
    show_all_columns = gr.Checkbox(label="Show all datasets", value=True)
    data_display = gr.Dataframe(df, datatype="markdown")

    # Legend explaining the abbreviated column headers; hidden by
    # update_display() when only the Mean columns are shown.
    # NOTE(review): the legend lists "Com. R." but the table column is
    # labelled "Comm." — confirm which abbreviation is intended.
    legend_accordion = gr.Accordion("Legend:", open=False, visible=True)
    with legend_accordion:
        gr.Markdown(
            """
            - Exist.: Existence
            - Count: Count
            - Posi.: Position
            - Col.: Color
            - Post.: Poster
            - Cel.: Celebrity
            - Sce.: Scene
            - Lan.: Landmark
            - Art.: Artwork
            - Com. R.: Commonsense Reasoning
            - Code: Code Reasoning
            - Num.: Numerical Calculation
            - Tran.: Text Translation
            - OCR: OCR
            """
        )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
                show_copy_button=True,
            )

    # NOTE(review): gr.State(df) snapshots the dataframe when the UI is
    # built, so the daily scheduler refresh below will not propagate to
    # this callback's data — confirm whether live refresh is expected.
    show_all_columns.change(
        update_display,
        inputs=[show_all_columns, gr.State(df)],
        outputs=[data_display, legend_accordion],
    )


# Background refresh of the leaderboard data from disk.
scheduler = BackgroundScheduler()
scheduler.add_job(update_data, "cron", hour=0)  # Update data once a day at midnight
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(share=True)
leaderboard/Leaderboard.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # 🔥🏅️GenCeption Leaderboard 🏅️🔥
4
+
5
+
6
+
7
+ </div>
8
+
9
+ #### GC@3 scores for different models and categories:
10
+ | Model | **Mean** | Exist. | Count | Posi. | Col. | Post. | Cel. | Sce. | Lan. | Art. | Comm. | **Vis Mean** | Code | Num. | Tran. | OCR | **Text Mean** |
11
+ |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12
+ | [ChatGPT-4V](https://cdn.openai.com/papers/GPTV_System_Card.pdf) | **0.351** | 0.422 | 0.404 | 0.408 | 0.403 | 0.324 | 0.332 | 0.393 | 0.353 | 0.421 | 0.471 | **0.393** | 0.193 | 0.24 | 0.157 | 0.393 | **0.246** |
13
+ | [mPLUG-Owl2](https://arxiv.org/pdf/2311.04257.pdf) | **0.257** | 0.323 | 0.299 | 0.306 | 0.29 | 0.243 | 0.232 | 0.299 | 0.275 | 0.252 | 0.353 | **0.287** | 0.176 | 0.192 | 0.081 | 0.276 | **0.181** |
14
+ | [LLaVA-13B](https://arxiv.org/pdf/2304.08485.pdf) | **0.238** | 0.305 | 0.294 | 0.255 | 0.3 | 0.215 | 0.206 | 0.277 | 0.242 | 0.212 | 0.334 | **0.264** | 0.144 | 0.195 | 0.116 | 0.239 | **0.174** |
15
+ | [LLaVA-7B](https://arxiv.org/pdf/2304.08485.pdf) | **0.225** | 0.308 | 0.253 | 0.285 | 0.284 | 0.214 | 0.188 | 0.266 | 0.252 | 0.21 | 0.294 | **0.255** | 0.107 | 0.155 | 0.111 | 0.222 | **0.149** |
16
+
17
+
18
+ Legend:
19
+ - Exist.: Existence
20
+ - Count: Count
21
+ - Posi.: Position
22
+ - Col.: Color
23
+ - Post.: Poster
24
+ - Cel.: Celebrity
25
+ - Sce.: Scene
26
+ - Lan.: Landmark
27
+ - Art.: Artwork
28
+ - Comm.: Commonsense Reasoning
29
+ - Code: Code Reasoning
30
+ - Num.: Numerical Calculation
31
+ - Tran.: Text Translation
32
+ - OCR: OCR
leaderboard/content.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TITLE = """<h1 align="center" id="space-title">🔥🏅️GenCeption Leaderboard 🏅️🔥</h1>"""
2
+
3
+ BANNER = """<div>
4
+ <p align="center">
5
+ <a href="https://github.com/EQTPartners/GenCeption/">GitHub</a>&emsp;•&emsp;
6
+ <a href="https://github.com/EQTPartners/GenCeption#contribute">Contribute</a>&emsp;•&emsp;
7
+ <a href="https://arxiv.org/abs/2402.14973">Paper</a>&emsp;•&emsp;
8
+ <a href="https://github.com/EQTPartners/GenCeption#cite-this-work">Citation</a>
9
+ </p>
10
+ """
11
+
12
+ INTRO = """GenCeption is an annotation-free MLLM (Multimodal Large Language Model) evaluation framework that merely requires unimodal data to assess inter-modality semantic coherence and inversely reflects the models' inclination to hallucinate."""
13
+ INTRO2 = """This leaderboard displays the evaluated models ranked by their performance on the **GC@3** metric, as defined in [GenCeption: Evaluate Multimodal LLMs with Unlabeled Unimodal Data](https://arxiv.org/abs/2402.14973). For contributing a model evaluation, please submit a pull request on [GitHub](https://github.com/EQTPartners/GenCeption)."""
14
+
15
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite this benchmark"
16
+ CITATION_BUTTON_TEXT = r"""
17
+ @article{cao2023genception,
18
+ author = {Lele Cao and
19
+ Valentin Buchner and
20
+ Zineb Senane and
21
+ Fangkai Yang},
22
+ title = {{GenCeption}: Evaluate Multimodal LLMs with Unlabeled Unimodal Data},
23
+ year={2023},
24
+ journal={arXiv preprint arXiv:2402.14973},
25
+ primaryClass={cs.AI,cs.CL,cs.LG}
26
+ }
27
+ """
leaderboard/leaderboard.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "name": "ChatGPT-4V",
5
+ "url": "https://cdn.openai.com/papers/GPTV_System_Card.pdf",
6
+ "scores": {
7
+ "Mean": 0.351,
8
+ "Exist": 0.422,
9
+ "Count": 0.404,
10
+ "Posi": 0.408,
11
+ "Col": 0.403,
12
+ "Post": 0.324,
13
+ "Cel": 0.332,
14
+ "Sce": 0.393,
15
+ "Lan": 0.353,
16
+ "Art": 0.421,
17
+ "Comm": 0.471,
18
+ "VisMean": 0.393,
19
+ "Code": 0.193,
20
+ "Num": 0.240,
21
+ "Tran": 0.157,
22
+ "OCR": 0.393,
23
+ "TextMean": 0.246
24
+ }
25
+ },
26
+ {
27
+ "name": "mPLUG-Owl2",
28
+ "url": "https://arxiv.org/pdf/2311.04257.pdf",
29
+ "scores": {
30
+ "Mean": 0.257,
31
+ "Exist": 0.323,
32
+ "Count": 0.299,
33
+ "Posi": 0.306,
34
+ "Col": 0.290,
35
+ "Post": 0.243,
36
+ "Cel": 0.232,
37
+ "Sce": 0.299,
38
+ "Lan": 0.275,
39
+ "Art": 0.252,
40
+ "Comm": 0.353,
41
+ "VisMean": 0.287,
42
+ "Code": 0.176,
43
+ "Num": 0.192,
44
+ "Tran": 0.081,
45
+ "OCR": 0.276,
46
+ "TextMean": 0.181
47
+ }
48
+ },
49
+ {
50
+ "name": "LLaVA-13B",
51
+ "url": "https://arxiv.org/pdf/2304.08485.pdf",
52
+ "scores": {
53
+ "Mean": 0.238,
54
+ "Exist": 0.305,
55
+ "Count": 0.294,
56
+ "Posi": 0.255,
57
+ "Col": 0.300,
58
+ "Post": 0.215,
59
+ "Cel": 0.206,
60
+ "Sce": 0.277,
61
+ "Lan": 0.242,
62
+ "Art": 0.212,
63
+ "Comm": 0.334,
64
+ "VisMean": 0.264,
65
+ "Code": 0.144,
66
+ "Num": 0.195,
67
+ "Tran": 0.116,
68
+ "OCR": 0.239,
69
+ "TextMean": 0.174
70
+ }
71
+ },
72
+ {
73
+ "name": "LLaVA-7B",
74
+ "url": "https://arxiv.org/pdf/2304.08485.pdf",
75
+ "scores": {
76
+ "Mean": 0.225,
77
+ "Exist": 0.308,
78
+ "Count": 0.253,
79
+ "Posi": 0.285,
80
+ "Col": 0.284,
81
+ "Post": 0.214,
82
+ "Cel": 0.188,
83
+ "Sce": 0.266,
84
+ "Lan": 0.252,
85
+ "Art": 0.210,
86
+ "Comm": 0.294,
87
+ "VisMean": 0.255,
88
+ "Code": 0.107,
89
+ "Num": 0.155,
90
+ "Tran": 0.111,
91
+ "OCR": 0.222,
92
+ "TextMean": 0.149
93
+ }
94
+ }
95
+ ]
96
+ }
97
+