attilasimko committed
Commit 3cfadc8
Parent: 2188124

new evaluations
.gitattributes CHANGED
File without changes
.gitignore CHANGED
@@ -1,5 +1,10 @@
-data/*.csv
+data/MIDL.csv
+data/MICCAI.csv
+data/arXiv.csv
+data/Nature.csv
+data/results.csv
 data/*.zip
+data/test/*
 *.env
 .env
 evaluations/__pycache__/*
data/dump.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/fetch_processed.py CHANGED
@@ -9,6 +9,7 @@ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
 
 for venue in custom_order:
     df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
+    df = df.replace('\t', ' ', regex=True)
     df.to_csv(f'data/{venue}.csv', sep="\t")
 
 # Store all evaluations here
data/fetch_zips.py ADDED
@@ -0,0 +1,43 @@
+import csv
+import numpy as np
+import sys
+import pandas as pd
+import re
+sys.path.append("./")
+from evaluations.utils import *
+
+token = os.getenv("githubToken")
+custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
+
+for venue in custom_order:
+    df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
+    df = df.replace('\t', ' ', regex=True)
+    df.to_csv(f'data/{venue}.csv', sep="\t")
+
+# Store all evaluations here
+paper_dump = pd.DataFrame()
+# Official color codes for conferences
+zip_idx = 0
+
+for venue in custom_order:
+    with open(f'data/{venue}.csv') as file:
+        tsv_file = csv.reader(file, delimiter="\t")
+        for row in tsv_file:
+            if (row[0] == ""):
+                continue
+
+            if (row[1] == ""):
+                continue
+
+            repo_url = row[4]
+            username, repo_name = decompose_url(repo_url)
+            repo_save_name = f"repo_{zip_idx}.zip"
+            repository_zip_name = f"data/test/{repo_save_name}"
+            log(0, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
+            fetch_repo(0, repo_url, repository_zip_name, token)
+
+            if (os.path.exists(repository_zip_name)):
+                paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]], "zip_idx": [ repository_zip_name ]})], ignore_index=True)
+            zip_idx += 1
+
+paper_dump.to_csv(f'data/zipfiles.csv', sep="\t")
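Note: as a minimal sketch (not part of the commit), the cache written by data/fetch_zips.py can be read back as shown below; the column names and the data/test/ archive paths follow the DataFrame built above, while the row index chosen is purely illustrative.

import pandas as pd
import zipfile

# zipfiles.csv is tab-separated; its zip_idx column stores the path of the cached archive (e.g. data/test/repo_0.zip)
dump = pd.read_csv("data/zipfiles.csv", sep="\t")
row = dump.iloc[0]  # illustrative: first cached paper
archive = zipfile.ZipFile(row["zip_idx"])
print(row["title"], "-", len(archive.namelist()), "files in the archive")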
data/zipfiles.csv ADDED
The diff for this file is too large to render. See raw diff
 
evaluations/repo_evaluations.py CHANGED
@@ -7,10 +7,7 @@ import os
 import numpy as np
 from huggingface_hub import InferenceClient
 
-def evaluate(llm, verbose, repo_url, title=None, year=None):
-    repository_zip_name = "data/repo.zip"
-    token = os.getenv("githubToken")
-
+def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
     try:
         if (not(llm)):
             log(verbose, "LOG", "No LLM will be used for the evaluation.")
@@ -31,20 +28,26 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
         return results
 
     username, repo_name = decompose_url(repo_url)
-    log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
 
-    fetch_repo(verbose, repo_url, repository_zip_name, token)
+    # If you don't provide a zip file, it will be fetched from github. For this, you need to provide a github token.
+    if (zip is None):
+        token = os.getenv("githubToken")
+        repository_zip_name = "data/repo.zip"
+        log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
 
-    if (not(os.path.exists(repository_zip_name))):
-        results["pred_live"] = "No"
-        return results
+        fetch_repo(verbose, repo_url, repository_zip_name, token)
 
-    zip = zipfile.ZipFile(repository_zip_name)
+        if (not(os.path.exists(repository_zip_name))):
+            results["pred_live"] = "No"
+            return results
+
+        results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
+
+        zip = zipfile.ZipFile(repository_zip_name)
+
     readme = fetch_readme(zip)
     results["NA"] = documentation.is_applicable(verbose, llm, readme)
 
-    results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
     results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
 
     if (len(zip.namelist()) <= 2):
@@ -65,7 +68,7 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
     return results
 
 def full_evaluation():
-    paper_dump = pd.read_csv("data/dump.csv", sep="\t")
+    paper_dump = pd.read_csv("data/zipfiles.csv", sep="\t")
     full_results = []
 
     for idx, row in paper_dump.iterrows():
@@ -74,62 +77,9 @@ def full_evaluation():
             continue
 
        print(str(int(100 * idx / paper_dump["title"].count())) + "% done")
-        result = evaluate(None, False, row["url"], row["title"], row["year"])
+        result = evaluate(None, False, row["url"], row["title"], row["year"], zip=zipfile.ZipFile(row["zip_idx"]))
         for column in result.keys():
             row[column] = result[column]
 
         full_results.append(row)
     return pd.DataFrame(full_results)
-
-def midl_evaluations(model):
-    compare_to_gt = True
-    paper_dump = pd.read_csv("data/dump.csv", sep="\t")
-    verbose = 0
-
-    eval_readme = []
-    eval_training = []
-    eval_evaluating = []
-    eval_licensing = []
-    eval_weights = []
-    eval_dependencies = []
-    full_results = []
-    for idx, row in paper_dump.iterrows():
-        if (row["venue"] != "MIDL"):
-            continue
-
-        if (row["venue"] == 2024):
-            continue
-
-        if (pd.isna(row["url"]) | (row["url"] == "")):
-            continue
-
-        print(f"\nEvaluating {idx+1} out of {len(paper_dump.index)} papers...")
-        print(f'Paper title - "{row["title"]}" ({row["year"]})')
-        print(f'Repository link - {row["url"]}')
-        result = evaluate(model, verbose, row["url"])
-        for column in result.keys():
-            row[column] = result[column]
-        full_results.append(row)
-        if (compare_to_gt):
-            print("\nSummary:")
-            print(row["NA"])
-            if ((~pd.isna(row["dependencies"])) & (row["pred_dependencies"] is not None)):
-                eval_dependencies.append(row["pred_dependencies"] == row["dependencies"])
-                print(f"Dependencies acc. - {row['pred_dependencies']} (GT:{row['dependencies']}) / {int(100 * np.mean(eval_dependencies))}%")
-            if ((~pd.isna(row["training"])) & (row["pred_dependencies"] is not None)):
-                eval_training.append(row["training"] == row["pred_training"])
-                print(f"Training acc. -{row['pred_training']} (GT:{row['training']}) / {int(100 * np.mean(eval_training))}%")
-            if ((~pd.isna(row["evaluation"])) & (row["pred_dependencies"] is not None)):
-                eval_evaluating.append(row["evaluation"] == row["pred_evaluation"])
-                print(f"Evaluating acc. - {row['pred_evaluation']} (GT:{row['evaluation']}) / {int(100 * np.mean(eval_evaluating))}%")
-            if ((~pd.isna(row["weights"])) & (row["pred_dependencies"] is not None)):
-                eval_weights.append(row["weights"] == row["pred_weights"])
-                print(f"Weights acc. - {row['pred_weights']} (GT:{row['weights']}) / {int(100 * np.mean(eval_weights))}%")
-            if ((~pd.isna(row["readme"])) & (row["pred_dependencies"] is not None)):
-                eval_readme.append(row["readme"] == row["pred_readme"])
-                print(f"README acc. - {row['pred_readme']} (GT:{row['readme']}) / {int(100 * np.mean(eval_readme))}%")
-            if ((~pd.isna(row["license"])) & (row["pred_dependencies"] is not None)):
-                eval_licensing.append(("No" if row["license"] == "No" else "Yes") == row["pred_license"])
-                print(f"LICENSE acc. - {row['pred_license']} (GT:{row['license']}) / {int(100 * np.mean(eval_licensing))}%")
-
-    return pd.DataFrame(full_results)
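For context, a minimal usage sketch of the reworked evaluate() entry point (assuming the package layout of this repository; the repository URL and archive path below are placeholders only):

import zipfile
from evaluations.repo_evaluations import evaluate

# Pass a locally cached archive so nothing is fetched from GitHub and no token is needed
results = evaluate(None, False, "https://github.com/example-user/example-repo",
                   zip=zipfile.ZipFile("data/test/repo_0.zip"))

# Without zip=, the repository is downloaded, which requires the githubToken environment variable
# results = evaluate(None, False, "https://github.com/example-user/example-repo")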
evaluations/utils.py CHANGED
@@ -51,7 +51,6 @@ def fetch_repo(verbose, repo_url, repo_name, token):
     if (os.path.exists(repo_name)):
         os.remove(repo_name)
 
-
     if ("github.com" not in repo_url):
         log(verbose, "ERROR", f"URL not for github repo, please evaluate manually ({repo_url}).")
         return
full_eval.py CHANGED
@@ -1,11 +1,4 @@
 from evaluations.repo_evaluations import full_evaluation
-# importing os module for environment variables
-import os
-# importing necessary functions from dotenv library
-from dotenv import load_dotenv
-# loading variables from .env file
-load_dotenv()
-token = os.getenv("githubToken")
 
 res = full_evaluation()
 res.to_csv("data/results.csv", sep="\t", index=False)
midl.py DELETED
@@ -1,12 +0,0 @@
-from evaluations.repo_evaluations import midl_evaluations
-from evaluations.models import LocalLLM
-import os
-from dotenv import load_dotenv
-load_dotenv()
-token = os.getenv("githubToken")
-
-# Load model directly
-model = LocalLLM("meta-llama/Llama-3.2-3B-Instruct")
-
-res = midl_evaluations(model)
-res.to_csv("results_midl.csv", sep="\t", index=False)
midl_summary.py ADDED
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+import numpy as np
+
+
+compare_to_gt = True
+ground_truth = pd.read_csv("data/zipfiles.csv", sep="\t")
+results = pd.read_csv("data/results.csv", sep="\t")
+verbose = 0
+
+eval_readme = []
+eval_training = []
+eval_evaluating = []
+eval_licensing = []
+eval_weights = []
+eval_dependencies = []
+full_results = []
+for (index1, row1), (index2, row2) in zip(ground_truth.iterrows(), results.iterrows()):
+    if (pd.isna(row1["training"])):
+        continue
+
+    print(f"\nEvaluating {index1+1} out of {len(ground_truth.index)} papers...")
+    print(f'Paper title - "{row1["title"]}" ({row1["year"]})')
+    print(f'Repository link - {row1["url"]}')
+    if ((not(pd.isna(row1["dependencies"]))) & (row2["pred_dependencies"] is not None)):
+        eval_dependencies.append(row2["pred_dependencies"] == row1["dependencies"])
+        if (row2["pred_dependencies"] != row1["dependencies"]):
+            print(f"Dependencies acc. - {row2['pred_dependencies']} (GT:{row1['dependencies']})")
+    if ((not(pd.isna(row1["training"]))) & (row2["pred_dependencies"] is not None)):
+        eval_training.append(row1["training"] == row2["pred_training"])
+        if (row1["training"] != row2["pred_training"]):
+            print(f"Training acc. -{row2['pred_training']} (GT:{row1['training']})")
+    if ((not(pd.isna(row1["evaluation"]))) & (row2["pred_dependencies"] is not None)):
+        eval_evaluating.append(row1["evaluation"] == row2["pred_evaluation"])
+        if (row1["evaluation"] != row2["pred_evaluation"]):
+            print(f"Evaluating acc. - {row2['pred_evaluation']} (GT:{row1['evaluation']})")
+    if ((not(pd.isna(row1["weights"]))) & (row2["pred_dependencies"] is not None)):
+        eval_weights.append(row1["weights"] == row2["pred_weights"])
+        if (row1["weights"] != row2["pred_weights"]):
+            print(f"Weights acc. - {row2['pred_weights']} (GT:{row1['weights']})")
+    if ((not(pd.isna(row1["readme"]))) & (row2["pred_dependencies"] is not None)):
+        eval_readme.append(row1["readme"] == row2["pred_readme"])
+        if (row1["readme"] != row2["pred_readme"]):
+            print(f"README acc. - {row2['pred_readme']} (GT:{row1['readme']})")
+    if ((not(pd.isna(row1["license"]))) & (row2["pred_dependencies"] is not None)):
+        eval_licensing.append(("No" if row1["license"] == "No" else "Yes") == row2["pred_license"])
+        if (("No" if row1["license"] == "No" else "Yes") != row2["pred_license"]):
+            print(f"LICENSE acc. - {row2['pred_license']} (GT:{row1['license']})")
+
+
+print("\nSummary:")
+print(f"Dependencies acc. - {int(100 * np.mean(eval_dependencies))}%")
+print(f"Training acc. - {int(100 * np.mean(eval_training))}%")
+print(f"Evaluating acc. - {int(100 * np.mean(eval_evaluating))}%")
+print(f"Weights acc. - {int(100 * np.mean(eval_weights))}%")
+print(f"README acc. - {int(100 * np.mean(eval_readme))}%")
+print(f"LICENSE acc. - {int(100 * np.mean(eval_licensing))}%")