Commit 8ac76ef
Parent(s): 62268f6

new evaluations

Files changed:
- README.md +0 -0
- app.py +0 -0
- data/fetch_arxiv.py +0 -0
- data/fetch_miccai.py +0 -0
- data/fetch_nature.py +0 -0
- data/fetch_processed.py +0 -0
- evaluations/documentation.py +11 -0
- evaluations/license.py +0 -0
- evaluations/models.py +1 -1
- evaluations/pitfalls.py +0 -0
- evaluations/repo_evaluations.py +46 -44
- evaluations/requirements.py +0 -0
- evaluations/training.py +0 -0
- evaluations/utils.py +0 -0
- evaluations/validating.py +0 -0
- evaluations/weights.py +0 -0
- full_eval.py +0 -0
- midl.py +1 -1
- plotting/paper_plots.py +0 -0
- plotting/result_plots.py +0 -0
- requirements.txt +0 -0
evaluations/documentation.py

@@ -2,6 +2,17 @@ from .utils import log,fetch_code
 import re
 import numpy as np
 
+def is_applicable(verbose, llm, readme):
+    applicable = "NA/NA/NA"
+    if (llm):
+        log(verbose, "TITLE", "\nChecking what parts of the evaluations are applicable...")
+        res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
+        res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
+        res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
+
+        applicable = f"{res_training}/{res_evaluation}/{res_weights}"
+    return applicable
+
 def evaluate(verbose, llm, zip, readme):
     log(verbose, "TITLE", "\nEvaluating code documentation...")
     overall = "No"
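The new is_applicable helper asks the LLM three yes/no questions about the README and packs the answers into a single "training/evaluation/weights" string, falling back to "NA/NA/NA" when no LLM is configured. A minimal sketch of how a caller might unpack that string; parse_applicability below is illustrative and not part of the repository:

# Illustrative only: split the applicability string returned by
# documentation.is_applicable into its three named parts.
def parse_applicability(applicable):
    training, evaluation, weights = applicable.split("/")
    return {"training": training, "evaluation": evaluation, "weights": weights}

# parse_applicability("Yes/No/NA")
# -> {"training": "Yes", "evaluation": "No", "weights": "NA"}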
evaluations/models.py

@@ -33,7 +33,7 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
 
 class LocalLLM():
     def __init__(self, model_name):
-        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000,
+        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000, device=0, pad_token_id=128001)
 
     def predict(self, response_type, prompt):
        messages = [
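The constructor change pins the text-generation pipeline to GPU 0 and sets pad_token_id=128001 (Llama-3's <|end_of_text|> token), which avoids the "pad_token_id not set" warning during generation. A minimal sketch of driving a pipeline configured this way with chat-style messages, assuming a recent transformers release; the model name and messages are illustrative, and the body of predict is not shown in this diff:

from transformers import pipeline

# Sketch only: a pipeline configured like LocalLLM.__init__ after this commit.
pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-3B-Instruct",
                max_new_tokens=1000, device=0, pad_token_id=128001)

messages = [
    {"role": "system", "content": "You are a chatbot evaluating github repositories ..."},
    {"role": "user", "content": "Should the repository contain code for training a model?"},
]
out = pipe(messages)
print(out[0]["generated_text"][-1]["content"])  # the assistant's reply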
evaluations/repo_evaluations.py

@@ -8,59 +8,61 @@ import numpy as np
 from huggingface_hub import InferenceClient
 
 def evaluate(llm, verbose, repo_url, title=None, year=None):
+    repository_zip_name = "data/repo.zip"
+    token = os.getenv("githubToken")
 
+    try:
+        if (not(llm)):
+            log(verbose, "LOG", "No LLM will be used for the evaluation.")
 
+        results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
+
+        if ((title != None) & (year != None) & (title != "") & (year != "")):
+            res = fetch_openalex(verbose, title, year)
+            if ((res != None)):
+                res = res["results"]
+                if (len(res) > 0):
+                    res = res[0]
+                    results["pred_citations"] = res["cited_by_count"]
 
+        if (get_api_link(repo_url) != ""):
+            results["pred_valid"] = True
+        else:
+            return results
 
+        username, repo_name = decompose_url(repo_url)
+        log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
 
+        fetch_repo(verbose, repo_url, repository_zip_name, token)
 
+        if (not(os.path.exists(repository_zip_name))):
+            results["pred_live"] = "No"
+            return results
 
+        zip = zipfile.ZipFile(repository_zip_name)
+        readme = fetch_readme(zip)
+        results["NA"] = documentation.is_applicable(verbose, llm, readme)
 
+        results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
+        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
 
+        if (len(zip.namelist()) <= 2):
+            log(verbose, "LOG", "The repository is empty.")
 
+        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
+        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
+        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
+        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
+        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
+        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+        pitfalls.evaluate(verbose, llm, zip, readme)
+
+        return results
+    except Exception as e:
+        log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
+        results["pred_live"] = "No"
+        return results
 
 def full_evaluation():
     paper_dump = pd.read_csv("data/dump.csv", sep="\t")

@@ -82,7 +84,7 @@ def full_evaluation():
 def midl_evaluations(model):
     compare_to_gt = True
     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
-    verbose =
+    verbose = 0
 
     eval_readme = []
     eval_training = []

@@ -101,7 +103,6 @@ def midl_evaluations(model):
         if (pd.isna(row["url"]) | (row["url"] == "")):
             continue
 
-
         print(f"\nEvaluating {idx+1} out of {len(paper_dump.index)} papers...")
         print(f'Paper title - "{row["title"]}" ({row["year"]})')
         print(f'Repository link - {row["url"]}')

@@ -111,6 +112,7 @@ def midl_evaluations(model):
         full_results.append(row)
         if (compare_to_gt):
             print("\nSummary:")
+            print(row["NA"])
             if ((~pd.isna(row["dependencies"])) & (row["pred_dependencies"] is not None)):
                 eval_dependencies.append(row["pred_dependencies"] == row["dependencies"])
                 print(f"Dependencies acc. - {row['pred_dependencies']} (GT:{row['dependencies']}) / {int(100 * np.mean(eval_dependencies))}%")
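After this commit, evaluate builds the results dict up front, wraps the whole repository check in try/except (any failure marks pred_live as "No"), and stores the applicability string from documentation.is_applicable under the "NA" key, which midl_evaluations now prints per paper. A minimal usage sketch, assuming package-style imports and a GitHub token in the environment; the repository URL is a placeholder:

import os
from evaluations.models import LocalLLM
from evaluations.repo_evaluations import evaluate

os.environ.setdefault("githubToken", "<github-token>")      # read inside evaluate via os.getenv
llm = LocalLLM("meta-llama/Llama-3.2-3B-Instruct")          # same model midl.py loads below
results = evaluate(llm, verbose=1, repo_url="https://github.com/<user>/<repo>")

print(results["pred_valid"], results["pred_live"])          # URL parsed / repository reachable
print(results.get("NA"))                                    # "training/evaluation/weights" applicability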
midl.py

@@ -6,7 +6,7 @@ load_dotenv()
 token = os.getenv("githubToken")
 
 # Load model directly
-model = LocalLLM("meta-llama/Llama-3.
+model = LocalLLM("meta-llama/Llama-3.2-3B-Instruct")
 
 res = midl_evaluations(model)
 res.to_csv("results_midl.csv", sep="\t", index=False)