MohamedRashad committed on
Commit
6efebdc
·
1 Parent(s): 9440e3a

Add retrieval and reranking leaderboard modules, update requirements and README

Browse files
.gitignore ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # .env file
177
+ .env
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📊
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.14.0
8
  app_file: app.py
9
  pinned: true
10
  short_description: The only leaderboard you will require for your RAG needs 🏆
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.24.0
8
  app_file: app.py
9
  pinned: true
10
  short_description: The only leaderboard you will require for your RAG needs 🏆
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import gradio as gr
2
- from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
3
- from fuzzywuzzy import fuzz
 
 
 
4
 
5
  HEADER = """<div style="text-align: center; margin-bottom: 20px;">
6
  <h1>The Arabic RAG Leaderboard</h1>
@@ -13,68 +16,6 @@ This leaderboard presents the first comprehensive benchmark for Arabic RAG syste
13
  For technical details, check our blog post <a href="https://huggingface.co/blog/Navid-AI/arabic-rag-leaderboard">here</a>.
14
  """
15
 
16
- RETRIEVAL_ABOUT_SECTION = """
17
- ## About Retrieval Evaluation
18
-
19
- The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
20
-
21
- ### Web Search Dataset Metrics
22
- - **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
23
- - **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
24
- - **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
25
- - **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
26
-
27
- ### Model Requirements
28
- - Must support Arabic text embeddings
29
- - Should handle queries of at least 512 tokens
30
- - Must work with `sentence-transformers` library
31
-
32
- ### Evaluation Process
33
- 1. Models process Arabic web search queries
34
- 2. Retrieved documents are evaluated using:
35
- - MRR for first relevant result positioning
36
- - nDCG for overall ranking quality
37
- - Recall@5 for top results accuracy
38
- 3. Metrics are averaged to calculate the overall score
39
- 4. Models are ranked based on their overall performance
40
-
41
- ### How to Prepare Your Model
42
- - Ensure your model is publicly available on HuggingFace Hub (We don't support private model evaluations yet)
43
- - Model should output fixed-dimension embeddings for text
44
- - Support batch processing for efficient evaluation (this is default if you use `sentence-transformers`)
45
- """
46
-
47
- RERANKER_ABOUT_SECTION = """
48
- ## About Reranking Evaluation
49
-
50
- The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
51
-
52
- ### Evaluation Metrics
53
- - **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10
54
- - **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10
55
- - **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents
56
-
57
- All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
58
-
59
- ### Model Requirements
60
- - Must accept query-document pairs as input
61
- - Should output relevance scores for reranking (has cross-attention or similar mechanism for query-document matching)
62
- - Support for Arabic text processing
63
-
64
- ### Evaluation Process
65
- 1. Models are tested on multiple unseen Arabic datasets
66
- 2. For each dataset:
67
- - Initial candidate documents are provided
68
- - Model reranks the candidates
69
- - MRR@10, NDCG@10, and MAP are calculated
70
- 3. Final scores are averaged across all datasets
71
- 4. Models are ranked based on overall performance
72
-
73
- ### How to Prepare Your Model
74
- - Model should be public on HuggingFace Hub (private models are not supported yet)
75
- - Make sure it works coherently with `sentence-transformers` library
76
- """
77
-
78
  CITATION_BUTTON_LABEL = """
79
  Copy the following snippet to cite these results
80
  """
@@ -89,162 +30,20 @@ CITATION_BUTTON_TEXT = """
89
  }
90
  """
91
 
92
- retrieval_df = None
93
- reranking_df = None
94
-
95
- def search_leaderboard(df, model_name, columns_to_show, threshold=95):
96
- if not model_name.strip():
97
- return df.loc[:, columns_to_show]
98
- search_name = model_name.lower() # compute once for efficiency
99
- def calculate_similarity(row):
100
- return fuzz.partial_ratio(search_name, row["Model"].lower())
101
- filtered_df = df.copy()
102
- filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
103
- filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values('similarity', ascending=False)
104
- filtered_df = filtered_df.drop('similarity', axis=1).loc[:, columns_to_show]
105
- return filtered_df
106
-
107
- def retrieval_search_leaderboard(model_name, columns_to_show):
108
- return search_leaderboard(retrieval_df, model_name, columns_to_show)
109
-
110
- def reranking_search_leaderboard(model_name, columns_to_show):
111
- return search_leaderboard(reranking_df, model_name, columns_to_show)
112
-
113
- def update_retrieval_columns_to_show(columns_to_show):
114
- global retrieval_df
115
- dummy_df = retrieval_df.loc[:, [col for col in retrieval_df.columns if col in columns_to_show]]
116
- columns_widths = []
117
- for col in dummy_df.columns:
118
- if col == "Rank":
119
- columns_widths.append(80)
120
- elif col == "Model":
121
- columns_widths.append(400)
122
- else:
123
- columns_widths.append(150)
124
- return gr.update(value=dummy_df, column_widths=columns_widths)
125
-
126
- def update_reranker_columns_to_show(columns_to_show):
127
- global reranking_df
128
- dummy_df = reranking_df.loc[:, [col for col in reranking_df.columns if col in columns_to_show]]
129
- columns_widths = []
130
- for col in dummy_df.columns:
131
- if col == "Rank":
132
- columns_widths.append(80)
133
- elif col == "Model":
134
- columns_widths.append(400)
135
- else:
136
- columns_widths.append(150)
137
- return gr.update(value=dummy_df, column_widths=columns_widths)
138
-
139
  def main():
140
- global retrieval_df, reranking_df
141
-
142
- # Prepare retrieval dataframe
143
- retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
144
- retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
145
- retrieval_df = retrieval_df[['Rank', 'Model', 'Web Search Dataset (Overall Score)', 'Model Size (MB)', 'Embedding Dimension', 'Max Tokens', 'Num Likes', 'Downloads Last Month', 'Web Search Dataset (MRR)', 'Web Search Dataset (nDCG@k=None)', 'Web Search Dataset (Recall@5)', 'License']]
146
- retrieval_columns_to_show = ["Rank", "Model", "Web Search Dataset (Overall Score)", "Model Size (MB)", "Embedding Dimension", "Max Tokens", "Num Likes"]
147
- retrieval_columns_widths = [80, 400, 150, 150, 150, 150, 150]
148
- retrieval_cols = retrieval_df.columns.tolist() # cache columns
149
-
150
- # Prepare reranking dataframe
151
- reranking_df = load_reranking_results(True, sort_col="Overall Score", drop_cols=["Revision", "Precision", "Task"])
152
- reranking_df.insert(0, "Rank", range(1, 1 + len(reranking_df)))
153
- reranking_df.rename(columns={"nDCG": "nDCG@10", "MRR": "MRR@10"}, inplace=True)
154
- reranking_columns_to_show = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)", "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"]
155
- reranking_columns_widths = [80, 400, 150, 150, 150, 150, 150, 150, 150]
156
- reranking_cols = reranking_df.columns.tolist() # cache columns
157
-
158
  with gr.Blocks() as demo:
159
  gr.HTML(HEADER)
160
 
161
  with gr.Tabs():
162
  with gr.Tab("πŸ•΅οΈβ€β™‚οΈ Retrieval"):
163
- with gr.Tabs():
164
- with gr.Tab("πŸ‘‘ Leaderboard"):
165
- with gr.Row():
166
- search_box_retrieval = gr.Textbox(
167
- placeholder="Search for models...",
168
- label="Search",
169
- scale=5
170
- )
171
- retrieval_columns_to_show_input = gr.CheckboxGroup(
172
- label="Columns to Show",
173
- choices=retrieval_cols, # use cached list
174
- value=retrieval_columns_to_show,
175
- scale=4
176
- )
177
-
178
- retrieval_leaderboard = gr.Dataframe(
179
- value=retrieval_df.loc[:, retrieval_columns_to_show],
180
- datatype="markdown",
181
- wrap=False,
182
- show_fullscreen_button=True,
183
- interactive=False,
184
- column_widths=retrieval_columns_widths
185
- )
186
-
187
- # Submit the search box and the leaderboard
188
- search_box_retrieval.input(
189
- retrieval_search_leaderboard,
190
- inputs=[search_box_retrieval, retrieval_columns_to_show_input],
191
- outputs=retrieval_leaderboard
192
- )
193
- retrieval_columns_to_show_input.select(
194
- update_retrieval_columns_to_show,
195
- inputs=retrieval_columns_to_show_input,
196
- outputs=retrieval_leaderboard
197
- )
198
-
199
- with gr.Tab("🏡️ Submit Retriever"):
200
- submit_gradio_module("Retriever")
201
-
202
- with gr.Tab("ℹ️ About"):
203
- gr.Markdown(RETRIEVAL_ABOUT_SECTION)
204
 
205
  with gr.Tab("πŸ“Š Reranking"):
206
- with gr.Tabs():
207
- with gr.Tab("πŸ‘‘ Leaderboard"):
208
- with gr.Row():
209
- search_box_reranker = gr.Textbox(
210
- placeholder="Search for models...",
211
- label="Search",
212
- scale=5
213
- )
214
- reranking_columns_to_show_input = gr.CheckboxGroup(
215
- label="Columns to Show",
216
- choices=reranking_cols, # use cached list
217
- value=reranking_columns_to_show,
218
- scale=4
219
- )
220
-
221
- reranker_leaderboard = gr.Dataframe(
222
- value=reranking_df[reranking_columns_to_show],
223
- datatype="markdown",
224
- wrap=False,
225
- show_fullscreen_button=True,
226
- interactive=False,
227
- column_widths=reranking_columns_widths
228
- )
229
-
230
- # Submit the search box and the leaderboard
231
- search_box_reranker.input(
232
- reranking_search_leaderboard,
233
- inputs=[search_box_reranker, reranking_columns_to_show_input],
234
- outputs=reranker_leaderboard
235
- )
236
- reranking_columns_to_show_input.select(
237
- update_reranker_columns_to_show,
238
- inputs=reranking_columns_to_show_input,
239
- outputs=reranker_leaderboard
240
- )
241
-
242
- with gr.Tab("🏡️ Submit Reranker"):
243
- submit_gradio_module("Reranker")
244
-
245
- with gr.Tab("ℹ️ About"):
246
- gr.Markdown(RERANKER_ABOUT_SECTION)
247
 
 
 
 
248
  with gr.Row():
249
  with gr.Accordion("πŸ“™ Citation", open=False):
250
  gr.Textbox(
 
1
  import gradio as gr
2
+ from retrieval_leaderboard import create_retrieval_tab
3
+ from reranking_leaderboard import create_reranking_tab
4
+ from llm_in_context_leaderboard import create_llm_in_context_tab
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
 
8
  HEADER = """<div style="text-align: center; margin-bottom: 20px;">
9
  <h1>The Arabic RAG Leaderboard</h1>
 
16
  For technical details, check our blog post <a href="https://huggingface.co/blog/Navid-AI/arabic-rag-leaderboard">here</a>.
17
  """
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  CITATION_BUTTON_LABEL = """
20
  Copy the following snippet to cite these results
21
  """
 
30
  }
31
  """
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def main():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  with gr.Blocks() as demo:
35
  gr.HTML(HEADER)
36
 
37
  with gr.Tabs():
38
  with gr.Tab("πŸ•΅οΈβ€β™‚οΈ Retrieval"):
39
+ create_retrieval_tab()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  with gr.Tab("πŸ“Š Reranking"):
42
+ create_reranking_tab()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
+ # with gr.Tab("πŸ“Š LLM in Context"):
45
+ # create_llm_in_context_tab()
46
+
47
  with gr.Row():
48
  with gr.Accordion("πŸ“™ Citation", open=False):
49
  gr.Textbox(
leaderboard_tab.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from fuzzywuzzy import fuzz
4
+ from utils import submit_gradio_module
5
+
6
def search_leaderboard(df, model_name, columns_to_show, threshold=95):
    """
    Search the leaderboard for models matching the search term using fuzzy matching.

    Args:
        df: The dataframe containing all leaderboard data; it must have a
            "Model" column holding strings.
        model_name: The search term to find models. ``None`` or a blank
            string disables filtering.
        columns_to_show: List of columns to include in the result
        threshold: Minimum ``fuzz.partial_ratio`` score (compared with ``>=``)
            a model must reach to be kept (default: 95)

    Returns:
        A dataframe restricted to ``columns_to_show``. With a search term it
        contains only the matching rows, highest similarity first; rows with
        equal similarity keep their original relative order.
    """
    # Treat a missing term like an empty one instead of failing on None.strip().
    if model_name is None or not model_name.strip():
        return df.loc[:, columns_to_show]

    # Normalise once: surrounding whitespace would otherwise count against the match.
    search_name = model_name.strip().lower()

    filtered_df = df.copy()
    # Column-wise map only touches "Model" and also works on an empty frame.
    filtered_df["similarity"] = filtered_df["Model"].map(
        lambda name: fuzz.partial_ratio(search_name, name.lower())
    )
    filtered_df = filtered_df[filtered_df["similarity"] >= threshold]
    # mergesort is stable, so tied matches stay in leaderboard order.
    filtered_df = filtered_df.sort_values("similarity", ascending=False, kind="mergesort")
    return filtered_df.drop("similarity", axis=1).loc[:, columns_to_show]
29
+
30
def update_columns_to_show(df, columns_to_show):
    """
    Build a Gradio update that shows only the selected columns.

    Args:
        df: The dataframe to take the columns from
        columns_to_show: Names of the columns that should stay visible

    Returns:
        ``gr.update(...)`` carrying the column subset (kept in ``df``'s own
        column order) and one width per visible column: 80 for "Rank",
        400 for "Model", 150 for everything else.
    """
    # Iterate df.columns (not the selection) so the table keeps the frame's order.
    visible = [name for name in df.columns if name in columns_to_show]
    subset = df.loc[:, visible]

    special_widths = {"Rank": 80, "Model": 400}
    widths = [special_widths.get(name, 150) for name in subset.columns]

    return gr.update(value=subset, column_widths=widths)
51
+
52
def create_leaderboard_tab(df, initial_columns_to_show, search_function, update_function, about_section, task_type):
    """
    Create a complete leaderboard tab with search, column selection, and data display.

    Args:
        df: The dataframe containing the leaderboard data
        initial_columns_to_show: Initial list of columns to display
        search_function: Callback taking (search text, selected columns);
            its result is written to the leaderboard table
        update_function: Callback taking the selected columns; its result is
            written to the leaderboard table
        about_section: Markdown text for the About tab
        task_type: Value passed to ``submit_gradio_module`` ("Retriever" or "Reranker")

    Returns:
        The ``gr.Tabs`` container holding the Leaderboard, Submit and About tabs
    """
    # Width policy: narrow "Rank", wide "Model", 150 for every other column.
    columns_widths = [80 if col == "Rank" else 400 if col == "Model" else 150 for col in initial_columns_to_show]

    # NOTE(review): the container nesting below was reconstructed from a
    # whitespace-stripped listing; confirm the Row/Column grouping visually.
    with gr.Tabs() as tabs:
        with gr.Tab("👑 Leaderboard"):
            with gr.Column():
                with gr.Row(equal_height=True):
                    search_box = gr.Textbox(
                        placeholder="Search for models...",
                        label="Search (You can also press Enter to search)",
                        scale=5
                    )
                    search_button = gr.Button(
                        value="Search",
                        variant="primary",
                        scale=1
                    )
                    columns_to_show_input = gr.CheckboxGroup(
                        label="Columns to Show",
                        choices=df.columns.tolist(),
                        value=initial_columns_to_show,
                        scale=4
                    )

                leaderboard = gr.Dataframe(
                    value=df.loc[:, initial_columns_to_show],
                    datatype="markdown",
                    wrap=True,
                    show_fullscreen_button=True,
                    interactive=False,
                    column_widths=columns_widths
                )

            # Connect events: pressing Enter and clicking the button run the same search.
            search_box.submit(
                search_function,
                inputs=[search_box, columns_to_show_input],
                outputs=leaderboard
            )
            columns_to_show_input.select(
                update_function,
                inputs=columns_to_show_input,
                outputs=leaderboard
            )
            search_button.click(
                search_function,
                inputs=[search_box, columns_to_show_input],
                outputs=leaderboard
            )

        with gr.Tab("🏵️ Submit"):
            submit_gradio_module(task_type)

        with gr.Tab("ℹ️ About"):
            gr.Markdown(about_section)

    return tabs
llm_in_context_leaderboard.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from utils import load_json_results
3
+ import gradio as gr
4
+ from leaderboard_tab import search_leaderboard, update_columns_to_show, create_leaderboard_tab
5
+
6
+ # Constants
7
+ LLM_IN_CONTEXT_ABOUT_SECTION = """"""
8
+
9
+ # Global variables
10
+ llm_in_context_df = None
11
+
12
def load_reranking_leaderboard():
    """Load the LLM-in-context results into the module-level ``llm_in_context_df``.

    Despite its name, this function reads ``results/llm_in_context_results.json``
    (resolved relative to this module) through ``load_json_results``. It then
    prepends a 1-based "Rank" column following the row order of the loaded
    frame and renames the "nDCG"/"MRR" columns to "nDCG@10"/"MRR@10".

    Returns:
        The prepared dataframe (the same object stored in the global).
    """
    global llm_in_context_df

    results_file = Path(__file__).parent / "results" / "llm_in_context_results.json"

    # Ordering and column dropping are delegated to load_json_results.
    llm_in_context_df = load_json_results(
        results_file,
        prepare_for_display=True,
        sort_col="Overall Score",
        drop_cols=["Revision", "Precision", "Task"],
    )

    row_count = len(llm_in_context_df)
    llm_in_context_df.insert(0, "Rank", range(1, row_count + 1))

    metric_labels = {"nDCG": "nDCG@10", "MRR": "MRR@10"}
    llm_in_context_df.rename(columns=metric_labels, inplace=True)

    return llm_in_context_df
29
+
30
def reranking_search_leaderboard(model_name, columns_to_show):
    """Run ``search_leaderboard`` against the module-level ``llm_in_context_df``.

    Args:
        model_name: Search text entered by the user
        columns_to_show: Columns to keep in the result

    Returns:
        Whatever ``search_leaderboard`` returns for the cached dataframe.
    """
    source_df = llm_in_context_df
    return search_leaderboard(source_df, model_name, columns_to_show)
33
+
34
def update_reranker_columns_to_show(columns_to_show):
    """Run ``update_columns_to_show`` against the module-level ``llm_in_context_df``.

    Args:
        columns_to_show: Columns that should stay visible

    Returns:
        Whatever ``update_columns_to_show`` returns for the cached dataframe.
    """
    source_df = llm_in_context_df
    return update_columns_to_show(source_df, columns_to_show)
37
+
38
def create_llm_in_context_tab():
    """Create the complete LLM-in-context leaderboard tab.

    Builds two leaderboard tabs that both display ``llm_in_context_df``,
    followed by a Submit tab and an About tab.

    Returns:
        The ``gr.Tabs`` container holding all four tabs.
    """
    global llm_in_context_df

    # Imported locally: this module's top-level imports do not bring it in.
    from utils import submit_gradio_module

    # Load data if not already loaded
    if llm_in_context_df is None:
        llm_in_context_df = load_reranking_leaderboard()

    # Define default columns to show
    default_columns = ["Rank", "Model", "Overall Score", "Model Parameters (in Millions)",
                       "Embedding Dimensions", "Downloads Last Month", "MRR@10", "nDCG@10", "MAP"]

    # Keep only defaults that exist so the initial ``.loc`` selection cannot raise KeyError.
    # NOTE(review): the default list mirrors the reranking schema; confirm it
    # against results/llm_in_context_results.json.
    initial_columns_to_show = [col for col in default_columns if col in llm_in_context_df.columns]

    # NOTE(review): submit_gradio_module is only seen called with
    # "Retriever"/"Reranker"; confirm the label it expects for this leaderboard.
    task_type = "LLM"

    columns_widths = [80 if col == "Rank" else 400 if col == "Model" else 150 for col in initial_columns_to_show]

    # NOTE(review): the container nesting below was reconstructed from a
    # whitespace-stripped listing; confirm the Row/Column grouping visually.
    with gr.Tabs() as tabs:
        # Both leaderboard tabs share the same layout and handlers.
        for tab_title in ("👑 Context Dependant Leaderboard", "💎 Context About Leaderboard"):
            with gr.Tab(tab_title):
                with gr.Column():
                    with gr.Row(equal_height=True):
                        search_box = gr.Textbox(
                            placeholder="Search for models...",
                            label="Search (You can also press Enter to search)",
                            scale=5
                        )
                        search_button = gr.Button(
                            value="Search",
                            variant="primary",
                            scale=1
                        )
                        columns_to_show_input = gr.CheckboxGroup(
                            label="Columns to Show",
                            choices=llm_in_context_df.columns.tolist(),
                            value=initial_columns_to_show,
                            scale=4
                        )

                    leaderboard = gr.Dataframe(
                        value=llm_in_context_df.loc[:, initial_columns_to_show],
                        datatype="markdown",
                        wrap=False,
                        show_fullscreen_button=True,
                        interactive=False,
                        column_widths=columns_widths
                    )

                # Connect events: pressing Enter and clicking the button run the same search.
                search_box.submit(
                    reranking_search_leaderboard,
                    inputs=[search_box, columns_to_show_input],
                    outputs=leaderboard
                )
                columns_to_show_input.select(
                    update_reranker_columns_to_show,
                    inputs=columns_to_show_input,
                    outputs=leaderboard
                )
                search_button.click(
                    reranking_search_leaderboard,
                    inputs=[search_box, columns_to_show_input],
                    outputs=leaderboard
                )

        with gr.Tab("🏵️ Submit"):
            submit_gradio_module(task_type)

        with gr.Tab("ℹ️ About"):
            gr.Markdown(LLM_IN_CONTEXT_ABOUT_SECTION)

    return tabs
152
+
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  fuzzywuzzy
2
- Levenshtein
 
 
1
  fuzzywuzzy
2
+ Levenshtein
3
+ python-dotenv
reranking_leaderboard.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from utils import load_json_results
3
+ from leaderboard_tab import search_leaderboard, update_columns_to_show, create_leaderboard_tab
4
+
5
+ # Constants
6
+ RERANKER_ABOUT_SECTION = """
7
+ ## About Reranking Evaluation
8
+
9
+ The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
10
+
11
+ ### Evaluation Metrics
12
+ - **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10
13
+ - **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10
14
+ - **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents
15
+
16
+ All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
17
+
18
+ ### Model Requirements
19
+ - Must accept query-document pairs as input
20
+ - Should output relevance scores for reranking (has cross-attention or similar mechanism for query-document matching)
21
+ - Support for Arabic text processing
22
+
23
+ ### Evaluation Process
24
+ 1. Models are tested on multiple unseen Arabic datasets
25
+ 2. For each dataset:
26
+ - Initial candidate documents are provided
27
+ - Model reranks the candidates
28
+ - MRR@10, NDCG@10, and MAP are calculated
29
+ 3. Final scores are averaged across all datasets
30
+ 4. Models are ranked based on overall performance
31
+
32
+ ### How to Prepare Your Model
33
+ - Model should be public on HuggingFace Hub (private models are not supported yet)
34
+ - Make sure it works coherently with `sentence-transformers` library
35
+ """
36
+
37
+ # Global variables
38
+ reranking_df = None
39
+
40
def load_reranking_results(prepare_for_display=False, sort_col=None, drop_cols=None):
    """Load ``results/reranking_results.json`` through ``load_json_results``.

    The file is resolved relative to this module. All three options are
    forwarded unchanged as keyword arguments.

    Args:
        prepare_for_display: Forwarded to ``load_json_results``
        sort_col: Forwarded to ``load_json_results``
        drop_cols: Forwarded to ``load_json_results``

    Returns:
        Whatever ``load_json_results`` returns for the reranking results file.
    """
    results_dir = Path(__file__).parent / "results"
    forwarded = {
        "prepare_for_display": prepare_for_display,
        "sort_col": sort_col,
        "drop_cols": drop_cols,
    }
    return load_json_results(results_dir / "reranking_results.json", **forwarded)
48
+
49
def load_reranking_leaderboard():
    """Load the reranking results into the module-level ``reranking_df``.

    Calls ``load_reranking_results`` with display preparation enabled,
    "Average Score" as the sort column and the "Revision", "Precision" and
    "Task" columns dropped, then prepends a 1-based "Rank" column following
    the row order of the loaded frame.

    Returns:
        The prepared dataframe (the same object stored in the global).
    """
    global reranking_df

    hidden_columns = ["Revision", "Precision", "Task"]
    reranking_df = load_reranking_results(True, sort_col="Average Score", drop_cols=hidden_columns)

    # Rank is positional: 1 for the first loaded row, 2 for the next, and so on.
    row_count = len(reranking_df)
    reranking_df.insert(0, "Rank", range(1, row_count + 1))

    return reranking_df
58
+
59
def reranking_search_leaderboard(model_name, columns_to_show):
    """Run ``search_leaderboard`` against the module-level ``reranking_df``.

    Args:
        model_name: Search text entered by the user
        columns_to_show: Columns to keep in the result

    Returns:
        Whatever ``search_leaderboard`` returns for the cached dataframe.
    """
    source_df = reranking_df
    return search_leaderboard(source_df, model_name, columns_to_show)
62
+
63
def update_reranker_columns_to_show(columns_to_show):
    """Run ``update_columns_to_show`` against the module-level ``reranking_df``.

    Args:
        columns_to_show: Columns that should stay visible

    Returns:
        Whatever ``update_columns_to_show`` returns for the cached dataframe.
    """
    source_df = reranking_df
    return update_columns_to_show(source_df, columns_to_show)
66
+
67
def create_reranking_tab():
    """Build the reranking leaderboard UI via ``create_leaderboard_tab``.

    Loads the reranking dataframe on first use (when the module-level
    ``reranking_df`` is still ``None``) and wires this module's search and
    column-update handlers into the shared tab builder.

    Returns:
        Whatever ``create_leaderboard_tab`` returns.
    """
    global reranking_df

    # Lazy load: the results file is only read the first time the tab is built.
    if reranking_df is None:
        reranking_df = load_reranking_leaderboard()

    # Columns visible before the user changes the selection.
    visible_by_default = [
        "Rank",
        "Model",
        "Average Score",
        "Model Size (MB)",
        "Context Length",
        "Embedding Dimension",
        "Namaa Global Knowledge",
        "Navid General Knowledge",
    ]

    tab_options = {
        "df": reranking_df,
        "initial_columns_to_show": visible_by_default,
        "search_function": reranking_search_leaderboard,
        "update_function": update_reranker_columns_to_show,
        "about_section": RERANKER_ABOUT_SECTION,
        "task_type": "Reranker",
    }
    return create_leaderboard_tab(**tab_options)
results/reranking_results.json CHANGED
@@ -1,242 +1,506 @@
1
  [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  {
3
  "Model": "BAAI/bge-reranker-v2-m3",
4
- "Overall Score": 85.4,
5
- "Embedding Dimensions": 1024,
6
- "Model Parameters (in Millions)": 568.0,
7
- "Downloads Last Month": 966662,
8
- "MRR": 79.41,
9
- "nDCG": 95.1,
10
- "MAP": 81.69,
11
- "Num Likes": 491,
12
- "License": "apache-2.0",
13
- "Precision": "F32",
14
  "Task": "Reranker",
15
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  },
17
  {
18
- "Model": "NAMAA-Space/Namaa-ARA-Reranker-V1",
19
- "Overall Score": 84.1,
20
- "Embedding Dimensions": 1024,
21
- "Model Parameters (in Millions)": 568.0,
22
- "Downloads Last Month": 121,
23
- "MRR": 76.48,
24
- "nDCG": 93.14,
25
- "MAP": 82.67,
26
- "Num Likes": 4,
27
- "License": "apache-2.0",
28
- "Precision": "F32",
29
  "Task": "Reranker",
30
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  },
32
  {
33
  "Model": "NAMAA-Space/GATE-Reranker-V1",
34
- "Overall Score": 76.81,
35
- "Embedding Dimensions": 768,
36
- "Model Parameters (in Millions)": 135.0,
37
- "Downloads Last Month": 753,
38
- "MRR": 62.61,
39
- "nDCG": 89.7,
40
- "MAP": 78.11,
 
 
 
 
 
 
 
 
 
41
  "Num Likes": 7,
42
- "License": "apache-2.0",
43
- "Precision": "F32",
 
 
 
 
44
  "Task": "Reranker",
45
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
- "Model": "cross-encoder/ms-marco-MiniLM-L-12-v2",
49
- "Overall Score": 60.54,
50
- "Embedding Dimensions": 384,
51
- "Model Parameters (in Millions)": 33.4,
52
- "Downloads Last Month": 777681,
53
- "MRR": 41.69,
54
- "nDCG": 67.62,
55
- "MAP": 72.32,
56
- "Num Likes": 68,
57
- "License": "apache-2.0",
58
- "Precision": "I64",
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  "Task": "Reranker",
60
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
  "Model": "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
64
- "Overall Score": 58.17,
65
- "Embedding Dimensions": 384,
66
- "Model Parameters (in Millions)": 118.0,
67
- "Downloads Last Month": 448,
68
- "MRR": 42.28,
69
- "nDCG": 75.63,
70
- "MAP": 56.59,
 
 
 
 
 
 
 
 
 
71
  "Num Likes": 4,
72
- "License": "apache-2.0",
73
- "Precision": "F32",
 
 
 
 
74
  "Task": "Reranker",
75
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  },
77
  {
78
- "Model": "aubmindlab/bert-base-arabert",
79
- "Overall Score": 56.69,
80
- "Embedding Dimensions": 768,
81
- "Model Parameters (in Millions)": 136.0,
82
- "Downloads Last Month": 71050,
83
- "MRR": 40.16,
84
- "nDCG": 71.14,
85
- "MAP": 58.77,
86
- "Num Likes": 27,
87
- "License": "N/A",
88
- "Precision": "F32",
89
  "Task": "Reranker",
90
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  },
92
  {
93
- "Model": "OrdalieTech/Solon-embeddings-large-0.1",
94
- "Overall Score": 54.73,
95
- "Embedding Dimensions": 1024,
96
- "Model Parameters (in Millions)": 560.0,
97
- "Downloads Last Month": 23830,
98
- "MRR": 32.59,
99
- "nDCG": 60.18,
100
- "MAP": 71.41,
101
- "Num Likes": 47,
102
- "License": "mit",
103
- "Precision": "F32",
104
  "Task": "Reranker",
105
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  },
107
  {
108
- "Model": "silma-ai/silma-embeddding-matryoshka-v0.1",
109
- "Overall Score": 54.16,
110
- "Embedding Dimensions": 768,
111
- "Model Parameters (in Millions)": 135.0,
112
- "Downloads Last Month": 957,
113
- "MRR": 35.6,
114
- "nDCG": 63.25,
115
- "MAP": 63.64,
116
- "Num Likes": 9,
117
- "License": "apache-2.0",
118
- "Precision": "F32",
119
  "Task": "Reranker",
120
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  },
122
  {
123
- "Model": "sentence-transformers/LaBSE",
124
- "Overall Score": 53.58,
125
- "Embedding Dimensions": 768,
126
- "Model Parameters (in Millions)": 471.0,
127
- "Downloads Last Month": 745051,
128
- "MRR": 32.9,
129
- "nDCG": 67.82,
130
- "MAP": 60.02,
131
- "Num Likes": 242,
132
- "License": "apache-2.0",
133
- "Precision": "F32",
134
  "Task": "Reranker",
135
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  },
137
  {
138
- "Model": "intfloat/multilingual-e5-large-instruct",
139
- "Overall Score": 53.39,
140
- "Embedding Dimensions": 1024,
141
- "Model Parameters (in Millions)": 560.0,
142
- "Downloads Last Month": 391080,
143
- "MRR": 48.35,
144
- "nDCG": 56.06,
145
- "MAP": 55.78,
146
- "Num Likes": 288,
147
- "License": "mit",
148
- "Precision": "F16",
149
  "Task": "Reranker",
150
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  },
152
  {
153
- "Model": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
154
- "Overall Score": 52.16,
155
- "Embedding Dimensions": 768,
156
- "Model Parameters (in Millions)": 135.0,
157
- "Downloads Last Month": 503,
158
- "MRR": 40.58,
159
- "nDCG": 54.7,
160
- "MAP": 61.2,
161
- "Num Likes": 2,
162
- "License": "apache-2.0",
163
- "Precision": "F32",
164
  "Task": "Reranker",
165
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  },
167
  {
168
- "Model": "Snowflake/snowflake-arctic-embed-l-v2.0",
169
- "Overall Score": 51.61,
170
- "Embedding Dimensions": 1024,
171
- "Model Parameters (in Millions)": 568.0,
172
- "Downloads Last Month": 86764,
173
- "MRR": 32.16,
174
- "nDCG": 58.63,
175
- "MAP": 64.05,
176
- "Num Likes": 111,
177
- "License": "apache-2.0",
178
- "Precision": "F32",
179
  "Task": "Reranker",
180
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
  {
183
  "Model": "aubmindlab/bert-large-arabertv2",
184
- "Overall Score": 50.84,
185
- "Embedding Dimensions": 1024,
186
- "Model Parameters (in Millions)": 371.0,
187
- "Downloads Last Month": 1352,
188
- "MRR": 27.84,
189
- "nDCG": 53.42,
190
- "MAP": 71.27,
 
 
 
 
 
 
 
 
 
191
  "Num Likes": 11,
192
- "License": "N/A",
193
- "Precision": "I64",
 
 
 
 
194
  "Task": "Reranker",
195
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  },
197
  {
198
- "Model": "Lajavaness/bilingual-embedding-large",
199
- "Overall Score": 49.91,
200
- "Embedding Dimensions": 1024,
201
- "Model Parameters (in Millions)": 560.0,
202
- "Downloads Last Month": 20987,
203
- "MRR": 33.93,
204
- "nDCG": 57.33,
205
- "MAP": 58.46,
206
- "Num Likes": 17,
207
- "License": "apache-2.0",
208
- "Precision": "F32",
209
- "Task": "Reranker",
210
- "Revision": "main"
211
- },
212
- {
213
- "Model": "jinaai/jina-embeddings-v3",
214
- "Overall Score": 49.44,
215
- "Embedding Dimensions": 4096,
216
- "Model Parameters (in Millions)": 572.0,
217
- "Downloads Last Month": 1523322,
218
- "MRR": 32.03,
219
- "nDCG": 60.41,
220
- "MAP": 55.87,
221
- "Num Likes": 726,
222
- "License": "cc-by-nc-4.0",
223
- "Precision": "BF16",
224
- "Task": "Reranker",
225
- "Revision": "main"
226
  },
227
  {
228
- "Model": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
229
- "Overall Score": 48.76,
230
- "Embedding Dimensions": 768,
231
- "Model Parameters (in Millions)": 471.0,
232
- "Downloads Last Month": 566,
233
- "MRR": 32.71,
234
- "nDCG": 56.6,
235
- "MAP": 56.97,
236
- "Num Likes": 2,
237
- "License": "apache-2.0",
238
- "Precision": "F32",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  "Task": "Reranker",
240
- "Revision": "main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  }
242
  ]
 
1
  [
2
+ {
3
+ "Model": "Alibaba-NLP/gte-multilingual-reranker-base",
4
+ "Revision": "main",
5
+ "Precision": "f16",
6
+ "Task": "Reranker",
7
+ "Average Score": 85.03,
8
+ "Context Length": 8192,
9
+ "Embedding Dimension": 768,
10
+ "Model Size (MB)": 582.44,
11
+ "Number of Parameters (Billions)": 0.305,
12
+ "Namaa Global Knowledge": 76.76,
13
+ "Namaa Tourism": 77.1,
14
+ "Namaa Media": 78.88,
15
+ "Namaa Legal": 85.87,
16
+ "Navid General Knowledge": 86.05,
17
+ "Navid Web Search": 99.36,
18
+ "Navid Islamic Studies": 91.18,
19
+ "Downloads Last Month": 215347,
20
+ "Num Likes": 114,
21
+ "License": "apache-2.0"
22
+ },
23
  {
24
  "Model": "BAAI/bge-reranker-v2-m3",
25
+ "Revision": "main",
26
+ "Precision": "f32",
 
 
 
 
 
 
 
 
27
  "Task": "Reranker",
28
+ "Average Score": 87.44,
29
+ "Context Length": 8192,
30
+ "Embedding Dimension": 1024,
31
+ "Model Size (MB)": 2165.81,
32
+ "Number of Parameters (Billions)": 0.568,
33
+ "Namaa Global Knowledge": 81.27,
34
+ "Namaa Tourism": 80.96,
35
+ "Namaa Media": 81.33,
36
+ "Namaa Legal": 88.58,
37
+ "Navid General Knowledge": 87.05,
38
+ "Navid Web Search": 99.3,
39
+ "Navid Islamic Studies": 93.59,
40
+ "Downloads Last Month": 1480691,
41
+ "Num Likes": 597,
42
+ "License": "apache-2.0"
43
  },
44
  {
45
+ "Model": "Lajavaness/bilingual-embedding-large",
46
+ "Revision": "main",
47
+ "Precision": "f32",
 
 
 
 
 
 
 
 
48
  "Task": "Reranker",
49
+ "Average Score": 47.37,
50
+ "Context Length": 512,
51
+ "Embedding Dimension": 1024,
52
+ "Model Size (MB)": 2135.81,
53
+ "Number of Parameters (Billions)": 0.56,
54
+ "Namaa Global Knowledge": 46.48,
55
+ "Namaa Tourism": 46.61,
56
+ "Namaa Media": 46.01,
57
+ "Namaa Legal": 49.64,
58
+ "Navid General Knowledge": 47.87,
59
+ "Navid Web Search": 48.37,
60
+ "Navid Islamic Studies": 46.6,
61
+ "Downloads Last Month": 24311,
62
+ "Num Likes": 21,
63
+ "License": "apache-2.0"
64
  },
65
  {
66
  "Model": "NAMAA-Space/GATE-Reranker-V1",
67
+ "Revision": "main",
68
+ "Precision": "f32",
69
+ "Task": "Reranker",
70
+ "Average Score": 83.96,
71
+ "Context Length": 512,
72
+ "Embedding Dimension": 768,
73
+ "Model Size (MB)": 515.72,
74
+ "Number of Parameters (Billions)": 0.135,
75
+ "Namaa Global Knowledge": 77.02,
76
+ "Namaa Tourism": 79.6,
77
+ "Namaa Media": 77.22,
78
+ "Namaa Legal": 84.41,
79
+ "Navid General Knowledge": 77.61,
80
+ "Navid Web Search": 95.71,
81
+ "Navid Islamic Studies": 96.14,
82
+ "Downloads Last Month": 1274,
83
  "Num Likes": 7,
84
+ "License": "apache-2.0"
85
+ },
86
+ {
87
+ "Model": "NAMAA-Space/Namaa-ARA-Reranker-V1",
88
+ "Revision": "main",
89
+ "Precision": "f32",
90
  "Task": "Reranker",
91
+ "Average Score": 85.82,
92
+ "Context Length": 8192,
93
+ "Embedding Dimension": 1024,
94
+ "Model Size (MB)": 2165.81,
95
+ "Number of Parameters (Billions)": 0.568,
96
+ "Namaa Global Knowledge": 80.18,
97
+ "Namaa Tourism": 77.7,
98
+ "Namaa Media": 79.07,
99
+ "Namaa Legal": 87.62,
100
+ "Navid General Knowledge": 85.19,
101
+ "Navid Web Search": 98.81,
102
+ "Navid Islamic Studies": 92.19,
103
+ "Downloads Last Month": 41,
104
+ "Num Likes": 4,
105
+ "License": "apache-2.0"
106
  },
107
  {
108
+ "Model": "OmarAlsaabi/e5-base-mlqa-finetuned-arabic-for-rag",
109
+ "Revision": "main",
110
+ "Precision": "f16",
111
+ "Task": "Reranker",
112
+ "Average Score": 49.99,
113
+ "Context Length": 512,
114
+ "Embedding Dimension": 768,
115
+ "Model Size (MB)": 530.33,
116
+ "Number of Parameters (Billions)": 0.278,
117
+ "Namaa Global Knowledge": 48.65,
118
+ "Namaa Tourism": 47.23,
119
+ "Namaa Media": 51.33,
120
+ "Namaa Legal": 45.23,
121
+ "Navid General Knowledge": 52.75,
122
+ "Navid Web Search": 52.65,
123
+ "Navid Islamic Studies": 52.11,
124
+ "Downloads Last Month": 79,
125
+ "Num Likes": 5,
126
+ "License": "N/A"
127
+ },
128
+ {
129
+ "Model": "OmarAlsaabi/e5-base-mlqa-finetuned-arabic-for-rag",
130
+ "Revision": "main",
131
+ "Precision": "f32",
132
  "Task": "Reranker",
133
+ "Average Score": 51.83,
134
+ "Context Length": 512,
135
+ "Embedding Dimension": 768,
136
+ "Model Size (MB)": 1060.65,
137
+ "Number of Parameters (Billions)": 0.278,
138
+ "Namaa Global Knowledge": 50.2,
139
+ "Namaa Tourism": 53.47,
140
+ "Namaa Media": 51.93,
141
+ "Namaa Legal": 51.72,
142
+ "Navid General Knowledge": 55.01,
143
+ "Navid Web Search": 47.75,
144
+ "Navid Islamic Studies": 52.7,
145
+ "Downloads Last Month": 79,
146
+ "Num Likes": 5,
147
+ "License": "N/A"
148
  },
149
  {
150
  "Model": "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
151
+ "Revision": "main",
152
+ "Precision": "f32",
153
+ "Task": "Reranker",
154
+ "Average Score": 47.93,
155
+ "Context Length": 128,
156
+ "Embedding Dimension": 384,
157
+ "Model Size (MB)": 448.81,
158
+ "Number of Parameters (Billions)": 0.118,
159
+ "Namaa Global Knowledge": 50.18,
160
+ "Namaa Tourism": 49.26,
161
+ "Namaa Media": 48.13,
162
+ "Namaa Legal": 45.68,
163
+ "Navid General Knowledge": 43.49,
164
+ "Navid Web Search": 48.87,
165
+ "Navid Islamic Studies": 49.87,
166
+ "Downloads Last Month": 127,
167
  "Num Likes": 4,
168
+ "License": "apache-2.0"
169
+ },
170
+ {
171
+ "Model": "Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
172
+ "Revision": "main",
173
+ "Precision": "f32",
174
  "Task": "Reranker",
175
+ "Average Score": 49.33,
176
+ "Context Length": 512,
177
+ "Embedding Dimension": 768,
178
+ "Model Size (MB)": 515.72,
179
+ "Number of Parameters (Billions)": 0.135,
180
+ "Namaa Global Knowledge": 50.03,
181
+ "Namaa Tourism": 50.52,
182
+ "Namaa Media": 48.73,
183
+ "Namaa Legal": 50.86,
184
+ "Navid General Knowledge": 49.1,
185
+ "Navid Web Search": 49.56,
186
+ "Navid Islamic Studies": 46.49,
187
+ "Downloads Last Month": 8143,
188
+ "Num Likes": 10,
189
+ "License": "apache-2.0"
190
  },
191
  {
192
+ "Model": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
193
+ "Revision": "main",
194
+ "Precision": "f32",
 
 
 
 
 
 
 
 
195
  "Task": "Reranker",
196
+ "Average Score": 47.86,
197
+ "Context Length": 128,
198
+ "Embedding Dimension": 768,
199
+ "Model Size (MB)": 1060.65,
200
+ "Number of Parameters (Billions)": 0.278,
201
+ "Namaa Global Knowledge": 47.51,
202
+ "Namaa Tourism": 48.59,
203
+ "Namaa Media": 50.78,
204
+ "Namaa Legal": 46.82,
205
+ "Navid General Knowledge": 41.96,
206
+ "Navid Web Search": 51.73,
207
+ "Navid Islamic Studies": 47.65,
208
+ "Downloads Last Month": 291,
209
+ "Num Likes": 2,
210
+ "License": "apache-2.0"
211
  },
212
  {
213
+ "Model": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
214
+ "Revision": "main",
215
+ "Precision": "f32",
 
 
 
 
 
 
 
 
216
  "Task": "Reranker",
217
+ "Average Score": 49.74,
218
+ "Context Length": 256,
219
+ "Embedding Dimension": 768,
220
+ "Model Size (MB)": 1798.7,
221
+ "Number of Parameters (Billions)": 0.472,
222
+ "Namaa Global Knowledge": 48.34,
223
+ "Namaa Tourism": 48.06,
224
+ "Namaa Media": 49.59,
225
+ "Namaa Legal": 44.44,
226
+ "Navid General Knowledge": 60.29,
227
+ "Navid Web Search": 46.9,
228
+ "Navid Islamic Studies": 50.54,
229
+ "Downloads Last Month": 215,
230
+ "Num Likes": 2,
231
+ "License": "apache-2.0"
232
  },
233
  {
234
+ "Model": "OrdalieTech/Solon-embeddings-large-0.1",
235
+ "Revision": "main",
236
+ "Precision": "f32",
 
 
 
 
 
 
 
 
237
  "Task": "Reranker",
238
+ "Average Score": 49.55,
239
+ "Context Length": 512,
240
+ "Embedding Dimension": 1024,
241
+ "Model Size (MB)": 2135.81,
242
+ "Number of Parameters (Billions)": 0.56,
243
+ "Namaa Global Knowledge": 48.29,
244
+ "Namaa Tourism": 51.34,
245
+ "Namaa Media": 49.63,
246
+ "Namaa Legal": 43.93,
247
+ "Navid General Knowledge": 49.86,
248
+ "Navid Web Search": 50.01,
249
+ "Navid Islamic Studies": 53.8,
250
+ "Downloads Last Month": 7825,
251
+ "Num Likes": 50,
252
+ "License": "mit"
253
  },
254
  {
255
+ "Model": "Snowflake/snowflake-arctic-embed-l-v2.0",
256
+ "Revision": "main",
257
+ "Precision": "f32",
 
 
 
 
 
 
 
 
258
  "Task": "Reranker",
259
+ "Average Score": 50.7,
260
+ "Context Length": 8192,
261
+ "Embedding Dimension": 1024,
262
+ "Model Size (MB)": 2165.81,
263
+ "Number of Parameters (Billions)": 0.568,
264
+ "Namaa Global Knowledge": 47.45,
265
+ "Namaa Tourism": 48.7,
266
+ "Namaa Media": 51.26,
267
+ "Namaa Legal": 49.66,
268
+ "Navid General Knowledge": 49.72,
269
+ "Navid Web Search": 55.93,
270
+ "Navid Islamic Studies": 52.16,
271
+ "Downloads Last Month": 117067,
272
+ "Num Likes": 148,
273
+ "License": "apache-2.0"
274
  },
275
  {
276
+ "Model": "anondeb/arabertv02_reranker_2021",
277
+ "Revision": "main",
278
+ "Precision": "f32",
 
 
 
 
 
 
 
 
279
  "Task": "Reranker",
280
+ "Average Score": 82.28,
281
+ "Context Length": 512,
282
+ "Embedding Dimension": 768,
283
+ "Model Size (MB)": 515.72,
284
+ "Number of Parameters (Billions)": 0.135,
285
+ "Namaa Global Knowledge": 76.43,
286
+ "Namaa Tourism": 77.25,
287
+ "Namaa Media": 75.3,
288
+ "Namaa Legal": 82.63,
289
+ "Navid General Knowledge": 75.05,
290
+ "Navid Web Search": 94.11,
291
+ "Navid Islamic Studies": 95.18,
292
+ "Downloads Last Month": 23,
293
+ "Num Likes": 0,
294
+ "License": "cc-by-nc-4.0"
295
  },
296
  {
297
+ "Model": "asafaya/bert-base-arabic",
298
+ "Revision": "main",
299
+ "Precision": "f32",
 
 
 
 
 
 
 
 
300
  "Task": "Reranker",
301
+ "Average Score": 68.89,
302
+ "Context Length": 512,
303
+ "Embedding Dimension": 768,
304
+ "Model Size (MB)": 421.97,
305
+ "Number of Parameters (Billions)": 0.111,
306
+ "Namaa Global Knowledge": 63.6,
307
+ "Namaa Tourism": 59.54,
308
+ "Namaa Media": 61.14,
309
+ "Namaa Legal": 72.6,
310
+ "Navid General Knowledge": 63.27,
311
+ "Navid Web Search": 86.84,
312
+ "Navid Islamic Studies": 75.25,
313
+ "Downloads Last Month": 10439,
314
+ "Num Likes": 38,
315
+ "License": "N/A"
316
  },
317
  {
318
+ "Model": "aubmindlab/bert-base-arabert",
319
+ "Revision": "main",
320
+ "Precision": "f32",
 
 
 
 
 
 
 
 
321
  "Task": "Reranker",
322
+ "Average Score": 56.75,
323
+ "Context Length": 512,
324
+ "Embedding Dimension": 768,
325
+ "Model Size (MB)": 515.72,
326
+ "Number of Parameters (Billions)": 0.135,
327
+ "Namaa Global Knowledge": 51.19,
328
+ "Namaa Tourism": 50.61,
329
+ "Namaa Media": 53.32,
330
+ "Namaa Legal": 49.75,
331
+ "Navid General Knowledge": 58.99,
332
+ "Navid Web Search": 64.7,
333
+ "Navid Islamic Studies": 68.72,
334
+ "Downloads Last Month": 50761,
335
+ "Num Likes": 29,
336
+ "License": "N/A"
337
  },
338
  {
339
  "Model": "aubmindlab/bert-large-arabertv2",
340
+ "Revision": "main",
341
+ "Precision": "i64",
342
+ "Task": "Reranker",
343
+ "Average Score": 48.68,
344
+ "Context Length": 512,
345
+ "Embedding Dimension": 1024,
346
+ "Model Size (MB)": 1409.24,
347
+ "Number of Parameters (Billions)": 0.369,
348
+ "Namaa Global Knowledge": 46.56,
349
+ "Namaa Tourism": 46.53,
350
+ "Namaa Media": 46.39,
351
+ "Namaa Legal": 45.89,
352
+ "Navid General Knowledge": 45.88,
353
+ "Navid Web Search": 63.33,
354
+ "Navid Islamic Studies": 46.16,
355
+ "Downloads Last Month": 2059,
356
  "Num Likes": 11,
357
+ "License": "N/A"
358
+ },
359
+ {
360
+ "Model": "colbert-ir/colbertv2.0",
361
+ "Revision": "main",
362
+ "Precision": "i64",
363
  "Task": "Reranker",
364
+ "Average Score": 52.44,
365
+ "Context Length": 512,
366
+ "Embedding Dimension": 768,
367
+ "Model Size (MB)": 417.64,
368
+ "Number of Parameters (Billions)": 0.109,
369
+ "Namaa Global Knowledge": 47.92,
370
+ "Namaa Tourism": 51.21,
371
+ "Namaa Media": 50.97,
372
+ "Namaa Legal": 44.7,
373
+ "Navid General Knowledge": 76.33,
374
+ "Navid Web Search": 49.91,
375
+ "Navid Islamic Studies": 46.06,
376
+ "Downloads Last Month": 1409633,
377
+ "Num Likes": 248,
378
+ "License": "mit"
379
  },
380
  {
381
+ "Model": "cross-encoder/ms-marco-MiniLM-L-12-v2",
382
+ "Revision": "main",
383
+ "Precision": "i64",
384
+ "Task": "Reranker",
385
+ "Average Score": 59.62,
386
+ "Context Length": 512,
387
+ "Embedding Dimension": 384,
388
+ "Model Size (MB)": 127.26,
389
+ "Number of Parameters (Billions)": 0.033,
390
+ "Namaa Global Knowledge": 62.81,
391
+ "Namaa Tourism": 59.16,
392
+ "Namaa Media": 60.86,
393
+ "Namaa Legal": 68.86,
394
+ "Navid General Knowledge": 60.42,
395
+ "Navid Web Search": 51.06,
396
+ "Navid Islamic Studies": 54.17,
397
+ "Downloads Last Month": 498422,
398
+ "Num Likes": 70,
399
+ "License": "apache-2.0"
 
 
 
 
 
 
 
 
 
400
  },
401
  {
402
+ "Model": "intfloat/multilingual-e5-large-instruct",
403
+ "Revision": "main",
404
+ "Precision": "f16",
405
+ "Task": "Reranker",
406
+ "Average Score": 53.46,
407
+ "Context Length": 512,
408
+ "Embedding Dimension": 1024,
409
+ "Model Size (MB)": 1067.91,
410
+ "Number of Parameters (Billions)": 0.56,
411
+ "Namaa Global Knowledge": 52.6,
412
+ "Namaa Tourism": 51.09,
413
+ "Namaa Media": 52.51,
414
+ "Namaa Legal": 50.97,
415
+ "Navid General Knowledge": 67.73,
416
+ "Navid Web Search": 49.48,
417
+ "Navid Islamic Studies": 49.83,
418
+ "Downloads Last Month": 995915,
419
+ "Num Likes": 416,
420
+ "License": "mit"
421
+ },
422
+ {
423
+ "Model": "oddadmix/arabic-reranker-v1",
424
+ "Revision": "main",
425
+ "Precision": "f32",
426
+ "Task": "Reranker",
427
+ "Average Score": 79.93,
428
+ "Context Length": 512,
429
+ "Embedding Dimension": 768,
430
+ "Model Size (MB)": 515.72,
431
+ "Number of Parameters (Billions)": 0.135,
432
+ "Namaa Global Knowledge": 74.08,
433
+ "Namaa Tourism": 72.28,
434
+ "Namaa Media": 70.31,
435
+ "Namaa Legal": 78.21,
436
+ "Navid General Knowledge": 85.0,
437
+ "Navid Web Search": 85.65,
438
+ "Navid Islamic Studies": 93.98,
439
+ "Downloads Last Month": 23,
440
+ "Num Likes": 1,
441
+ "License": "N/A"
442
+ },
443
+ {
444
+ "Model": "omarelshehy/Arabic-Retrieval-v1.0",
445
+ "Revision": "main",
446
+ "Precision": "f32",
447
+ "Task": "Reranker",
448
+ "Average Score": 43.7,
449
+ "Context Length": 512,
450
+ "Embedding Dimension": 768,
451
+ "Model Size (MB)": 515.73,
452
+ "Number of Parameters (Billions)": 0.135,
453
+ "Namaa Global Knowledge": 47.98,
454
+ "Namaa Tourism": 51.39,
455
+ "Namaa Media": 47.91,
456
+ "Namaa Legal": 50.77,
457
+ "Navid General Knowledge": 50.42,
458
+ "Navid Web Search": 29.07,
459
+ "Navid Islamic Studies": 28.39,
460
+ "Downloads Last Month": 174,
461
+ "Num Likes": 1,
462
+ "License": "apache-2.0"
463
+ },
464
+ {
465
+ "Model": "sentence-transformers/LaBSE",
466
+ "Revision": "main",
467
+ "Precision": "i64",
468
  "Task": "Reranker",
469
+ "Average Score": 49.88,
470
+ "Context Length": 256,
471
+ "Embedding Dimension": 768,
472
+ "Model Size (MB)": 1798.7,
473
+ "Number of Parameters (Billions)": 0.472,
474
+ "Namaa Global Knowledge": 51.04,
475
+ "Namaa Tourism": 49.33,
476
+ "Namaa Media": 49.27,
477
+ "Namaa Legal": 45.63,
478
+ "Navid General Knowledge": 52.54,
479
+ "Navid Web Search": 49.96,
480
+ "Navid Islamic Studies": 51.39,
481
+ "Downloads Last Month": 558352,
482
+ "Num Likes": 258,
483
+ "License": "apache-2.0"
484
+ },
485
+ {
486
+ "Model": "silma-ai/silma-embeddding-matryoshka-v0.1",
487
+ "Revision": "main",
488
+ "Precision": "f32",
489
+ "Task": "Reranker",
490
+ "Average Score": 44.05,
491
+ "Context Length": 512,
492
+ "Embedding Dimension": 768,
493
+ "Model Size (MB)": 515.72,
494
+ "Number of Parameters (Billions)": 0.135,
495
+ "Namaa Global Knowledge": 47.17,
496
+ "Namaa Tourism": 46.42,
497
+ "Namaa Media": 47.97,
498
+ "Namaa Legal": 52.81,
499
+ "Navid General Knowledge": 54.47,
500
+ "Navid Web Search": 30.82,
501
+ "Navid Islamic Studies": 28.66,
502
+ "Downloads Last Month": 405,
503
+ "Num Likes": 11,
504
+ "License": "apache-2.0"
505
  }
506
  ]
results/retrieval_results.json CHANGED
@@ -1,162 +1,114 @@
1
  [
2
  {
3
- "Model": "Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet",
4
- "Max Tokens": 128,
5
  "Revision": "main",
6
- "Precision": "F32",
7
  "Task": "Retriever",
8
- "Embedding Dimension": 384,
9
- "Model Size (MB)": 448.81,
10
- "Web Search Dataset (MRR)": 46.0,
11
- "Web Search Dataset (Recall@5)": 56.19,
12
- "Web Search Dataset (nDCG@k=None)": 55.34,
13
- "Web Search Dataset (Overall Score)": 52.51,
14
- "Downloads Last Month": 448,
15
- "Num Likes": 4,
 
16
  "License": "apache-2.0"
17
  },
18
  {
19
- "Model": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
20
- "Max Tokens": 256,
21
  "Revision": "main",
22
- "Precision": "F32",
23
  "Task": "Retriever",
 
 
24
  "Embedding Dimension": 768,
25
- "Model Size (MB)": 1798.7,
26
- "Web Search Dataset (MRR)": 56.96,
27
- "Web Search Dataset (Recall@5)": 70.05,
28
- "Web Search Dataset (nDCG@k=None)": 65.27,
29
- "Web Search Dataset (Overall Score)": 64.09,
30
- "Downloads Last Month": 566,
31
- "Num Likes": 2,
32
  "License": "apache-2.0"
33
  },
34
  {
35
- "Model": "intfloat/multilingual-e5-large-instruct",
36
- "Max Tokens": 512,
37
  "Revision": "main",
38
- "Precision": "F16",
39
- "Task": "Retriever",
40
- "Embedding Dimension": 1024,
41
- "Model Size (MB)": 1067.91,
42
- "Web Search Dataset (MRR)": 65.26,
43
- "Web Search Dataset (Recall@5)": 74.14,
44
- "Web Search Dataset (nDCG@k=None)": 71.66,
45
- "Web Search Dataset (Overall Score)": 70.35,
46
- "Downloads Last Month": 391080,
47
- "Num Likes": 288,
48
- "License": "mit"
49
- },
50
- {
51
- "Model": "omarelshehy/Arabic-Retrieval-v1.0",
52
- "Max Tokens": 512,
53
- "Revision": "main",
54
- "Precision": "F32",
55
  "Task": "Retriever",
 
 
56
  "Embedding Dimension": 768,
57
- "Model Size (MB)": 515.73,
58
- "Web Search Dataset (MRR)": 55.32,
59
- "Web Search Dataset (Recall@5)": 64.76,
60
- "Web Search Dataset (nDCG@k=None)": 63.48,
61
- "Web Search Dataset (Overall Score)": 61.19,
62
- "Downloads Last Month": 194,
63
- "Num Likes": 0,
64
  "License": "apache-2.0"
65
  },
66
  {
67
- "Model": "omarelshehy/Arabic-STS-Matryoshka-V2",
68
- "Max Tokens": 512,
69
  "Revision": "main",
70
- "Precision": "F32",
71
  "Task": "Retriever",
 
 
72
  "Embedding Dimension": 768,
73
  "Model Size (MB)": 515.72,
74
- "Web Search Dataset (MRR)": 41.83,
75
- "Web Search Dataset (Recall@5)": 49.21,
76
- "Web Search Dataset (nDCG@k=None)": 50.71,
77
- "Web Search Dataset (Overall Score)": 47.25,
78
- "Downloads Last Month": 264,
79
- "Num Likes": 1,
80
- "License": "N/A"
81
- },
82
- {
83
- "Model": "omarelshehy/Arabic-STS-Matryoshka",
84
- "Max Tokens": 512,
85
- "Revision": "main",
86
- "Precision": "F32",
87
- "Task": "Retriever",
88
- "Embedding Dimension": 1024,
89
- "Model Size (MB)": 2135.81,
90
- "Web Search Dataset (MRR)": 63.2,
91
- "Web Search Dataset (Recall@5)": 74.41,
92
- "Web Search Dataset (nDCG@k=None)": 70.43,
93
- "Web Search Dataset (Overall Score)": 69.35,
94
- "Downloads Last Month": 167,
95
- "Num Likes": 2,
96
  "License": "apache-2.0"
97
  },
98
  {
99
- "Model": "omarelshehy/arabic-english-sts-matryoshka-v2.0",
100
- "Max Tokens": 512,
101
- "Revision": "main",
102
- "Precision": "F32",
103
- "Task": "Retriever",
104
- "Embedding Dimension": 1024,
105
- "Model Size (MB)": 2135.81,
106
- "Web Search Dataset (MRR)": 56.02,
107
- "Web Search Dataset (Recall@5)": 67.85,
108
- "Web Search Dataset (nDCG@k=None)": 64.47,
109
- "Web Search Dataset (Overall Score)": 62.78,
110
- "Downloads Last Month": 324,
111
- "Num Likes": 1,
112
- "License": "N/A"
113
- },
114
- {
115
- "Model": "omarelshehy/arabic-english-sts-matryoshka",
116
- "Max Tokens": 512,
117
  "Revision": "main",
118
- "Precision": "F32",
119
  "Task": "Retriever",
120
- "Embedding Dimension": 1024,
121
- "Model Size (MB)": 2135.81,
122
- "Web Search Dataset (MRR)": 51.5,
123
- "Web Search Dataset (Recall@5)": 62.91,
124
- "Web Search Dataset (nDCG@k=None)": 60.55,
125
- "Web Search Dataset (Overall Score)": 58.32,
126
- "Downloads Last Month": 295,
127
- "Num Likes": 0,
 
128
  "License": "apache-2.0"
129
  },
130
  {
131
- "Model": "silma-ai/silma-embeddding-matryoshka-v0.1",
132
- "Max Tokens": 512,
133
  "Revision": "main",
134
- "Precision": "F32",
135
  "Task": "Retriever",
 
 
136
  "Embedding Dimension": 768,
137
  "Model Size (MB)": 515.72,
138
- "Web Search Dataset (MRR)": 45.54,
139
- "Web Search Dataset (Recall@5)": 56.5,
140
- "Web Search Dataset (nDCG@k=None)": 55.32,
141
- "Web Search Dataset (Overall Score)": 52.46,
142
- "Downloads Last Month": 957,
143
- "Num Likes": 9,
144
- "License": "apache-2.0"
145
  },
146
  {
147
- "Model": "silma-ai/silma-embeddding-sts-v0.1",
148
- "Max Tokens": 512,
149
  "Revision": "main",
150
- "Precision": "F32",
151
  "Task": "Retriever",
 
 
152
  "Embedding Dimension": 768,
153
- "Model Size (MB)": 515.72,
154
- "Web Search Dataset (MRR)": 47.28,
155
- "Web Search Dataset (Recall@5)": 57.97,
156
- "Web Search Dataset (nDCG@k=None)": 56.72,
157
- "Web Search Dataset (Overall Score)": 53.99,
158
- "Downloads Last Month": 1392,
159
- "Num Likes": 4,
160
- "License": "apache-2.0"
161
  }
162
  ]
 
1
  [
2
  {
3
+ "Model": "Alibaba-NLP/gte-multilingual-base",
 
4
  "Revision": "main",
5
+ "Precision": "f16",
6
  "Task": "Retriever",
7
+ "Average Score": 61.02,
8
+ "Context Length": 8192,
9
+ "Embedding Dimension": 768,
10
+ "Model Size (MB)": 582.44,
11
+ "Number of Parameters (Billions)": 0.305,
12
+ "Web Search Dataset": 80.2,
13
+ "Islamic Knowledge Dataset": 41.84,
14
+ "Downloads Last Month": 1340501,
15
+ "Num Likes": 233,
16
  "License": "apache-2.0"
17
  },
18
  {
19
+ "Model": "NAMAA-Space/AraModernBert-Base-STS",
 
20
  "Revision": "main",
21
+ "Precision": "f32",
22
  "Task": "Retriever",
23
+ "Average Score": 49.99,
24
+ "Context Length": 512,
25
  "Embedding Dimension": 768,
26
+ "Model Size (MB)": 568.19,
27
+ "Number of Parameters (Billions)": 0.149,
28
+ "Web Search Dataset": 37.9,
29
+ "Islamic Knowledge Dataset": 62.08,
30
+ "Downloads Last Month": 205,
31
+ "Num Likes": 6,
 
32
  "License": "apache-2.0"
33
  },
34
  {
35
+ "Model": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
 
36
  "Revision": "main",
37
+ "Precision": "f32",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  "Task": "Retriever",
39
+ "Average Score": 42.54,
40
+ "Context Length": 512,
41
  "Embedding Dimension": 768,
42
+ "Model Size (MB)": 515.72,
43
+ "Number of Parameters (Billions)": 0.135,
44
+ "Web Search Dataset": 44.49,
45
+ "Islamic Knowledge Dataset": 40.59,
46
+ "Downloads Last Month": 697,
47
+ "Num Likes": 10,
 
48
  "License": "apache-2.0"
49
  },
50
  {
51
+ "Model": "Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
 
52
  "Revision": "main",
53
+ "Precision": "f32",
54
  "Task": "Retriever",
55
+ "Average Score": 55.14,
56
+ "Context Length": 512,
57
  "Embedding Dimension": 768,
58
  "Model Size (MB)": 515.72,
59
+ "Number of Parameters (Billions)": 0.135,
60
+ "Web Search Dataset": 50.93,
61
+ "Islamic Knowledge Dataset": 59.35,
62
+ "Downloads Last Month": 8143,
63
+ "Num Likes": 10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  "License": "apache-2.0"
65
  },
66
  {
67
+ "Model": "Omartificial-Intelligence-Space/GATE-AraBert-v1",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "Revision": "main",
69
+ "Precision": "f16",
70
  "Task": "Retriever",
71
+ "Average Score": 53.53,
72
+ "Context Length": 512,
73
+ "Embedding Dimension": 768,
74
+ "Model Size (MB)": 257.86,
75
+ "Number of Parameters (Billions)": 0.135,
76
+ "Web Search Dataset": 50.97,
77
+ "Islamic Knowledge Dataset": 56.09,
78
+ "Downloads Last Month": 3885,
79
+ "Num Likes": 12,
80
  "License": "apache-2.0"
81
  },
82
  {
83
+ "Model": "mohamed2811/Muffakir_Embedding",
 
84
  "Revision": "main",
85
+ "Precision": "f32",
86
  "Task": "Retriever",
87
+ "Average Score": 60.03,
88
+ "Context Length": 512,
89
  "Embedding Dimension": 768,
90
  "Model Size (MB)": 515.72,
91
+ "Number of Parameters (Billions)": 0.135,
92
+ "Web Search Dataset": 54.5,
93
+ "Islamic Knowledge Dataset": 65.56,
94
+ "Downloads Last Month": 615,
95
+ "Num Likes": 1,
96
+ "License": "N/A"
 
97
  },
98
  {
99
+ "Model": "omarelshehy/Arabic-STS-Matryoshka-V2",
 
100
  "Revision": "main",
101
+ "Precision": "f16",
102
  "Task": "Retriever",
103
+ "Average Score": 52.38,
104
+ "Context Length": 512,
105
  "Embedding Dimension": 768,
106
+ "Model Size (MB)": 257.86,
107
+ "Number of Parameters (Billions)": 0.135,
108
+ "Web Search Dataset": 47.25,
109
+ "Islamic Knowledge Dataset": 57.5,
110
+ "Downloads Last Month": 263,
111
+ "Num Likes": 1,
112
+ "License": "N/A"
 
113
  }
114
  ]
retrieval_leaderboard.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from utils import load_json_results
3
+ from leaderboard_tab import search_leaderboard, update_columns_to_show, create_leaderboard_tab
4
+
5
+ # Constants
6
+ RETRIEVAL_ABOUT_SECTION = """
7
+ ## About Retrieval Evaluation
8
+
9
+ The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
10
+
11
+ ### Web Search Dataset Metrics
12
+ - **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
13
+ - **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
14
+ - **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
15
+ - **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
16
+
17
+ ### Model Requirements
18
+ - Must support Arabic text embeddings
19
+ - Should handle queries of at least 512 tokens
20
+ - Must work with `sentence-transformers` library
21
+
22
+ ### Evaluation Process
23
+ 1. Models process Arabic web search queries
24
+ 2. Retrieved documents are evaluated using:
25
+ - MRR for first relevant result positioning
26
+ - nDCG for overall ranking quality
27
+ - Recall@5 for top results accuracy
28
+ 3. Metrics are averaged to calculate the overall score
29
+ 4. Models are ranked based on their overall performance
30
+
31
+ ### How to Prepare Your Model
32
+ - Ensure your model is publicly available on HuggingFace Hub (We don't support private model evaluations yet)
33
+ - Model should output fixed-dimension embeddings for text
34
+ - Support batch processing for efficient evaluation (this is default if you use `sentence-transformers`)
35
+ """
36
+
37
+ # Global variables
38
+ retrieval_df = None
39
+
40
+ def load_retrieval_results(prepare_for_display=False, sort_col=None, drop_cols=None):
41
+ dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
42
+ return load_json_results(
43
+ dataframe_path,
44
+ prepare_for_display=prepare_for_display,
45
+ sort_col=sort_col,
46
+ drop_cols=drop_cols
47
+ )
48
+
49
+ def load_retrieval_leaderboard():
50
+ """Load and prepare the retrieval leaderboard data"""
51
+ global retrieval_df
52
+
53
+ # Prepare retrieval dataframe
54
+ retrieval_df = load_retrieval_results(True, "Average Score", drop_cols=["Revision", "Precision", "Task"])
55
+ retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
56
+
57
+ return retrieval_df
58
+
59
+ def retrieval_search_leaderboard(model_name, columns_to_show):
60
+ """Search function for retrieval leaderboard"""
61
+ return search_leaderboard(retrieval_df, model_name, columns_to_show)
62
+
63
+ def update_retrieval_columns_to_show(columns_to_show):
64
+ """Update displayed columns for retrieval leaderboard"""
65
+ return update_columns_to_show(retrieval_df, columns_to_show)
66
+
67
+ def create_retrieval_tab():
68
+ """Create the complete retrieval leaderboard tab"""
69
+ global retrieval_df
70
+
71
+ # Load data if not already loaded
72
+ if retrieval_df is None:
73
+ retrieval_df = load_retrieval_leaderboard()
74
+
75
+ # Define default columns to show
76
+ default_columns = ["Rank", "Model", "Average Score", "Model Size (MB)", "Context Length",
77
+ "Embedding Dimension", "Web Search Dataset", "Islamic Knowledge Dataset"]
78
+
79
+ # Create and return the tab
80
+ return create_leaderboard_tab(
81
+ df=retrieval_df,
82
+ initial_columns_to_show=default_columns,
83
+ search_function=retrieval_search_leaderboard,
84
+ update_function=update_retrieval_columns_to_show,
85
+ about_section=RETRIEVAL_ABOUT_SECTION,
86
+ task_type="Retriever"
87
+ )
utils.py CHANGED
@@ -12,8 +12,11 @@ DATASET_REPO_ID = f"{OWNER}/requests-dataset"
12
 
13
  results_dir = Path(__file__).parent / "results"
14
 
15
- # Cache the HF token to avoid multiple os.environ lookups.
16
- HF_TOKEN = os.environ.get('HF_TOKEN', None)
 
 
 
17
 
18
  # Add a helper to load JSON results with optional formatting.
19
  def load_json_results(file_path: Path, prepare_for_display=False, sort_col=None, drop_cols=None):
@@ -30,24 +33,6 @@ def load_json_results(file_path: Path, prepare_for_display=False, sort_col=None,
30
  df.sort_values(sort_col, ascending=False, inplace=True)
31
  return df
32
 
33
- def load_retrieval_results(prepare_for_display=False, sort_col=None, drop_cols=None):
34
- dataframe_path = results_dir / "retrieval_results.json"
35
- return load_json_results(
36
- dataframe_path,
37
- prepare_for_display=prepare_for_display,
38
- sort_col=sort_col,
39
- drop_cols=drop_cols
40
- )
41
-
42
- def load_reranking_results(prepare_for_display=False, sort_col=None, drop_cols=None):
43
- dataframe_path = results_dir / "reranking_results.json"
44
- return load_json_results(
45
- dataframe_path,
46
- prepare_for_display=prepare_for_display,
47
- sort_col=sort_col,
48
- drop_cols=drop_cols
49
- )
50
-
51
  def get_model_info(model_id, verbose=False):
52
  model_info = api.model_info(model_id)
53
  num_downloads = model_info.downloads
@@ -71,16 +56,12 @@ def fetch_model_information(model_name):
71
  return
72
  return gr.update(choices=supported_precisions, value=supported_precisions[0]), license, num_parameters, num_downloads, num_likes
73
 
74
- def submit_model(model_name, revision, precision, params, license, task):
75
- # Load existing evaluations
76
- if task == "Retriever":
77
- df = load_retrieval_results()
78
- elif task == "Reranker":
79
- df = load_reranking_results()
80
- else:
81
- return "Task is not supported 🤷‍♂️"
82
-
83
- existing_models_results = df[['Model', 'Revision', 'Precision', 'Task']]
84
 
85
  # Handle 'Missing' precision
86
  if precision == 'Missing':
@@ -92,14 +73,6 @@ def submit_model(model_name, revision, precision, params, license, task):
92
  df_pending = load_requests('pending')
93
  df_finished = load_requests('finished')
94
 
95
- # Check if model is already evaluated
96
- model_exists_in_results = ((existing_models_results['Model'] == model_name) &
97
- (existing_models_results['Revision'] == revision) &
98
- (existing_models_results['Precision'] == precision.capitalize()) &
99
- (existing_models_results['Task'] == task)).any()
100
- if model_exists_in_results:
101
- return f"Model {model_name} has already been evaluated as a {task} 🎉"
102
-
103
  # Check if model is in pending requests
104
  if not df_pending.empty:
105
  existing_models_pending = df_pending[['model_name', 'revision', 'precision', 'task']]
@@ -108,7 +81,7 @@ def submit_model(model_name, revision, precision, params, license, task):
108
  (existing_models_pending['precision'] == precision.capitalize()) &
109
  (existing_models_pending['task'] == task)).any()
110
  if model_exists_in_pending:
111
- return f"Model {model_name} is already in the evaluation queue as a {task} 🚀"
112
 
113
  # Check if model is in finished requests
114
  if not df_finished.empty:
@@ -267,11 +240,6 @@ def submit_gradio_module(task_type):
267
  inputs=[model_name_input],
268
  outputs=fetch_outputs
269
  )
270
- submit_button.click(
271
- submit_model,
272
- inputs=[model_name_input, revision_input, precision_input, params_input, license_input, var],
273
- outputs=submission_result
274
- )
275
 
276
  # Load pending, finished, and failed requests
277
  df_pending = load_requests('pending', task_type)
@@ -282,9 +250,10 @@ def submit_gradio_module(task_type):
282
  gr.Markdown("## Evaluation Status")
283
  with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
284
  if not df_pending.empty:
285
- gr.Dataframe(df_pending)
286
  else:
287
  gr.Markdown("No pending evaluations.")
 
288
  with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
289
  if not df_finished.empty:
290
  gr.Dataframe(df_finished)
@@ -295,3 +264,9 @@ def submit_gradio_module(task_type):
295
  gr.Dataframe(df_failed)
296
  else:
297
  gr.Markdown("No failed evaluations.")
 
 
 
 
 
 
 
12
 
13
  results_dir = Path(__file__).parent / "results"
14
 
15
+ # Read the HF token once at import time and warn if it is missing, since authenticated API operations will fail without it.
16
+ HF_TOKEN = os.environ.get('HF_TOKEN')
17
+ if not HF_TOKEN:
18
+ print("Warning: HF_TOKEN environment variable not set. API operations requiring authentication will fail.")
19
+ HF_TOKEN = None
20
 
21
  # Add a helper to load JSON results with optional formatting.
22
  def load_json_results(file_path: Path, prepare_for_display=False, sort_col=None, drop_cols=None):
 
33
  df.sort_values(sort_col, ascending=False, inplace=True)
34
  return df
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def get_model_info(model_id, verbose=False):
37
  model_info = api.model_info(model_id)
38
  num_downloads = model_info.downloads
 
56
  return
57
  return gr.update(choices=supported_precisions, value=supported_precisions[0]), license, num_parameters, num_downloads, num_likes
58
 
59
+ def submit_model(model_name, revision, precision, params, license, task, pending_gradio_df):
60
+ try:
61
+ if float(params) > 5000:
62
+ return "Model size should be less than 5000 million parameters (5 billion) 👀", pending_gradio_df
63
+ except ValueError:
64
+ gr.Error("The parameter count is not present or is not a number. Please make sure its available and its correct"),
 
 
 
 
65
 
66
  # Handle 'Missing' precision
67
  if precision == 'Missing':
 
73
  df_pending = load_requests('pending')
74
  df_finished = load_requests('finished')
75
 
 
 
 
 
 
 
 
 
76
  # Check if model is in pending requests
77
  if not df_pending.empty:
78
  existing_models_pending = df_pending[['model_name', 'revision', 'precision', 'task']]
 
81
  (existing_models_pending['precision'] == precision.capitalize()) &
82
  (existing_models_pending['task'] == task)).any()
83
  if model_exists_in_pending:
84
+ return f"Model {model_name} is already in the evaluation queue as a {task} 🚀", pending_gradio_df
85
 
86
  # Check if model is in finished requests
87
  if not df_finished.empty:
 
240
  inputs=[model_name_input],
241
  outputs=fetch_outputs
242
  )
 
 
 
 
 
243
 
244
  # Load pending, finished, and failed requests
245
  df_pending = load_requests('pending', task_type)
 
250
  gr.Markdown("## Evaluation Status")
251
  with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
252
  if not df_pending.empty:
253
+ pending_gradio_df = gr.Dataframe(df_pending)
254
  else:
255
  gr.Markdown("No pending evaluations.")
256
+ pending_gradio_df = None
257
  with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
258
  if not df_finished.empty:
259
  gr.Dataframe(df_finished)
 
264
  gr.Dataframe(df_failed)
265
  else:
266
  gr.Markdown("No failed evaluations.")
267
+
268
+ submit_button.click(
269
+ submit_model,
270
+ inputs=[model_name_input, revision_input, precision_input, params_input, license_input, var, pending_gradio_df],
271
+ outputs=[submission_result, pending_gradio_df],
272
+ )