attilasimko committed
Commit 8ac76ef · 1 Parent(s): 62268f6

new evaluations

README.md CHANGED
File without changes
app.py CHANGED
File without changes
data/fetch_arxiv.py CHANGED
File without changes
data/fetch_miccai.py CHANGED
File without changes
data/fetch_nature.py CHANGED
File without changes
data/fetch_processed.py CHANGED
File without changes
evaluations/documentation.py CHANGED
@@ -2,6 +2,17 @@ from .utils import log,fetch_code
 import re
 import numpy as np
 
+def is_applicable(verbose, llm, readme):
+    applicable = "NA/NA/NA"
+    if (llm):
+        log(verbose, "TITLE", "\nChecking what parts of the evaluations are applicable...")
+        res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
+        res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
+        res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
+
+        applicable = f"{res_training}/{res_evaluation}/{res_weights}"
+    return applicable
+
 def evaluate(verbose, llm, zip, readme):
     log(verbose, "TITLE", "\nEvaluating code documentation...")
     overall = "No"
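
The new is_applicable helper asks the LLM three yes/no-style questions (training code, evaluation code, pre-trained weights) and joins the answers into one slash-separated string, falling back to "NA/NA/NA" when no LLM is available. Below is a minimal sketch of how the returned triple could be unpacked downstream; the parse_applicable helper is hypothetical and not part of this commit.

# Hypothetical helper (not part of this commit): unpack the slash-separated
# applicability triple returned by documentation.is_applicable.
def parse_applicable(applicable):
    training, evaluation, weights = applicable.split("/")
    return {"training": training, "evaluation": evaluation, "weights": weights}

# With no LLM configured, is_applicable returns the default triple:
print(parse_applicable("NA/NA/NA"))  # {'training': 'NA', 'evaluation': 'NA', 'weights': 'NA'}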
evaluations/license.py CHANGED
File without changes
evaluations/models.py CHANGED
@@ -33,7 +33,7 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
 
 class LocalLLM():
     def __init__(self, model_name):
-        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000, device_map={0: 0})
+        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000, device=0, pad_token_id=128001)
 
     def predict(self, response_type, prompt):
         messages = [
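
The constructor change above pins the pipeline to a single GPU via device=0 and passes an explicit pad_token_id. A standalone sketch of the same call follows, assuming the transformers package is installed, a CUDA device is available at index 0, and that 128001 is the end-of-text token id used for padding in this model family; the model name is only an example.

from transformers import pipeline

# Build a text-generation pipeline on GPU 0, mirroring the updated LocalLLM.__init__.
# pad_token_id=128001 is assumed to be the end-of-text id for Llama-3-style tokenizers,
# which avoids the "Setting pad_token_id to eos_token_id" warning during generation.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-3B-Instruct",
    max_new_tokens=1000,
    device=0,
    pad_token_id=128001,
)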
evaluations/pitfalls.py CHANGED
File without changes
evaluations/repo_evaluations.py CHANGED
@@ -8,59 +8,61 @@ import numpy as np
 from huggingface_hub import InferenceClient
 
 def evaluate(llm, verbose, repo_url, title=None, year=None):
-    repository_zip_name = "data/repo.zip"
-    token = os.getenv("githubToken")
-
-    if (not(llm)):
-        log(verbose, "LOG", "No LLM will be used for the evaluation.")
-
-    results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
-
-    try:
-        if (get_api_link(repo_url) != ""):
-            results["pred_valid"] = True
-        else:
-            return results
-
-        username, repo_name = decompose_url(repo_url)
-        log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
-
-        fetch_repo(verbose, repo_url, repository_zip_name, token)
-
-        if ((title != None) & (year != None) & (title != "") & (year != "")):
-            res = fetch_openalex(verbose, title, year)
-            if (res != None):
-                res = res["results"]
-                if (len(res) > 0):
-                    res = res[0]
-                    results["pred_citations"] = res["cited_by_count"]
-
-        if (not(os.path.exists(repository_zip_name))):
-            results["pred_live"] = "No"
-            return results
-
-        zip = zipfile.ZipFile(repository_zip_name)
-        readme = fetch_readme(zip)
-
-        results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
-
-        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
-        if (len(zip.namelist()) <= 2):
-            log(verbose, "LOG", "The repository is empty.")
-
-        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
-        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
-        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
-        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
-        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
-        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
-        pitfalls.evaluate(verbose, llm, zip, readme)
-
-        return results
-    except Exception as e:
-        log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
-        results["pred_live"] = "No"
-        return results
+    repository_zip_name = "data/repo.zip"
+    token = os.getenv("githubToken")
+
+    try:
+        if (not(llm)):
+            log(verbose, "LOG", "No LLM will be used for the evaluation.")
+
+        results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
+
+        if ((title != None) & (year != None) & (title != "") & (year != "")):
+            res = fetch_openalex(verbose, title, year)
+            if ((res != None)):
+                res = res["results"]
+                if (len(res) > 0):
+                    res = res[0]
+                    results["pred_citations"] = res["cited_by_count"]
+
+        if (get_api_link(repo_url) != ""):
+            results["pred_valid"] = True
+        else:
+            return results
+
+        username, repo_name = decompose_url(repo_url)
+        log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
+
+        fetch_repo(verbose, repo_url, repository_zip_name, token)
+
+
+        if (not(os.path.exists(repository_zip_name))):
+            results["pred_live"] = "No"
+            return results
+
+        zip = zipfile.ZipFile(repository_zip_name)
+        readme = fetch_readme(zip)
+        results["NA"] = documentation.is_applicable(verbose, llm, readme)
+
+        results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
+        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
+
+        if (len(zip.namelist()) <= 2):
+            log(verbose, "LOG", "The repository is empty.")
+
+        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
+        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
+        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
+        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
+        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
+        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+        pitfalls.evaluate(verbose, llm, zip, readme)
+
+        return results
+    except Exception as e:
+        log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
+        results["pred_live"] = "No"
+        return results
 
 def full_evaluation():
     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
@@ -82,7 +84,7 @@ def full_evaluation():
 def midl_evaluations(model):
     compare_to_gt = True
     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
-    verbose = 1
+    verbose = 0
 
     eval_readme = []
     eval_training = []
@@ -101,7 +103,6 @@ def midl_evaluations(model):
         if (pd.isna(row["url"]) | (row["url"] == "")):
            continue
 
-
        print(f"\nEvaluating {idx+1} out of {len(paper_dump.index)} papers...")
        print(f'Paper title - "{row["title"]}" ({row["year"]})')
        print(f'Repository link - {row["url"]}')
@@ -111,6 +112,7 @@
        full_results.append(row)
        if (compare_to_gt):
            print("\nSummary:")
+            print(row["NA"])
            if ((~pd.isna(row["dependencies"])) & (row["pred_dependencies"] is not None)):
                eval_dependencies.append(row["pred_dependencies"] == row["dependencies"])
                print(f"Dependencies acc. - {row['pred_dependencies']} (GT:{row['dependencies']}) / {int(100 * np.mean(eval_dependencies))}%")
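
evaluate now records the applicability triple from documentation.is_applicable under results["NA"], which midl_evaluations prints in each paper's summary. A rough usage sketch follows, under the assumption that the modules import as laid out in this repository; the GitHub URL below is a placeholder.

# Illustrative only: module paths follow the repository layout shown in this commit,
# and the repository URL is a placeholder.
from evaluations.models import LocalLLM
from evaluations.repo_evaluations import evaluate

llm = LocalLLM("meta-llama/Llama-3.2-3B-Instruct")  # or None to skip LLM-based checks
results = evaluate(llm, 1, "https://github.com/username/repository")
print(results["NA"])         # applicability triple, e.g. "Yes/Yes/No" (training/evaluation/weights)
print(results["pred_live"])  # "Yes" if the repository could be fetched and unpacked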
evaluations/requirements.py CHANGED
File without changes
evaluations/training.py CHANGED
File without changes
evaluations/utils.py CHANGED
File without changes
evaluations/validating.py CHANGED
File without changes
evaluations/weights.py CHANGED
File without changes
full_eval.py CHANGED
File without changes
midl.py CHANGED
@@ -6,7 +6,7 @@ load_dotenv()
 token = os.getenv("githubToken")
 
 # Load model directly
-model = LocalLLM("meta-llama/Llama-3.1-8B-Instruct")
+model = LocalLLM("meta-llama/Llama-3.2-3B-Instruct")
 
 res = midl_evaluations(model)
 res.to_csv("results_midl.csv", sep="\t", index=False)
plotting/paper_plots.py CHANGED
File without changes
plotting/result_plots.py CHANGED
File without changes
requirements.txt CHANGED
File without changes