Commit · 3abd747
1 Parent(s): 67b2553

Let's see some logs

Files changed:
- app.py +1 -1
- evaluations/documentation.py +7 -7
- evaluations/models.py +17 -2
- evaluations/pitfalls.py +2 -37
app.py CHANGED
@@ -3,7 +3,7 @@ from evaluations.repo_evaluations import evaluate
 from evaluations.models import RemoteLLM
 import requests

-model = RemoteLLM("meta-llama/Llama-3.1-
+model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")

 st.write("\n")
 st.write("Welcome to the online reproducibility evaluation tool!")
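
Note: RemoteLLM is defined in evaluations/models.py and its body is not part of this diff. As a rough, hypothetical sketch of what such a wrapper might look like (assuming it calls the Hugging Face Inference API via huggingface_hub.InferenceClient and pairs an entry of the system_messages dict with the user prompt; only the class name and the predict(key, prompt) call shape are taken from this commit):

import os
from huggingface_hub import InferenceClient

# Abbreviated stand-in for the system_messages dict defined in evaluations/models.py.
system_messages = {"HELP": "You are a chatbot evaluating github repositories..."}

class RemoteLLM:
    def __init__(self, model_name):
        # Assumes an HF API token is available in the Space's environment.
        self.client = InferenceClient(model=model_name, token=os.environ.get("HF_TOKEN"))

    def predict(self, key, prompt):
        # Pair the selected system message ("HELP", "PITFALL", ...) with the user prompt.
        response = self.client.chat_completion(
            messages=[
                {"role": "system", "content": system_messages[key]},
                {"role": "user", "content": prompt},
            ],
            max_tokens=512,
        )
        return response.choices[0].message.content

With a wrapper along these lines, the changed line in app.py only swaps the hosted model id; later model.predict("HELP", ...) calls reuse the same client.
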
evaluations/documentation.py CHANGED
@@ -15,6 +15,12 @@ def evaluate(verbose, llm, zip, readme):
     non_empty_rows = [row for row in readme.split("\n") if row != ""]
     if (len(non_empty_rows) < 5):
         log(verbose, "ERROR", "Readme file has very few lines")
+
+        if (llm):
+            code = fetch_code(zip)
+            if (llm):
+                summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
+                log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
         return overall

     if (count_code_lines(non_empty_rows) > 5):
@@ -22,18 +28,12 @@ def evaluate(verbose, llm, zip, readme):
         overall = "Yes"
         return overall

-    if (llm):
-        code = fetch_code(zip)
-        if (llm):
-            summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
-            log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")

     if (llm):
         prompt = f'{readme}\n \
             Is this README file is enough to find what \
             package dependencies you need to install and how to train \
-            and evaluate the proposed model?
-            answer yes or no.\n\nA:'
+            and evaluate the proposed model?'
         llm.predict("HELP", prompt)


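
Note: fetch_code(zip) is a helper defined elsewhere in the repository and is not shown here. A minimal sketch of what it plausibly does, assuming zip points to a zipped copy of the repository and the helper concatenates its Python sources into one string for the prompt (the names and details are assumptions, not taken from the commit):

import zipfile

def fetch_code(zip_path):
    # Hypothetical sketch: gather every .py file in the zipped repository into one string.
    chunks = []
    with zipfile.ZipFile(zip_path) as archive:
        for name in archive.namelist():
            if name.endswith(".py"):
                source = archive.read(name).decode("utf-8", errors="ignore")
                chunks.append(f"# File: {name}\n{source}")
    return "\n\n".join(chunks)
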
evaluations/models.py CHANGED
@@ -8,8 +8,23 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
     "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
     Please help me answer the following question.
     Keep your answers short, and informative.
-    Your answer should be a single paragraph.
-
+    Your answer should be a single paragraph.""",
+    "PITFALL": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
+    You are looking for common pitfalls in the code. More specifically please consider the follwing pitfalls:
+    Please explain if you find any design-flaws with regards to the data collection in the code."))
+    Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments)."))
+    Please explain if you find any confounders in the code."))
+    Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)"))
+    Please explain if you find signs of historical biases in the data used."))
+    Please explain if you find signs of information leaking between the training and testing data."))
+    Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)"))
+    Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias)."))
+    Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)"))
+    Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)"))
+    Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)"))
+    Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)"))
+    Please explain if you find signs of faulty interpretations of the reported results.
+    If you don't find anything concerning, please return an empty string.""" }

 class LocalLLM():
     def __init__(self, model_name):
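
Note: only the first two lines of LocalLLM appear above; the rest of the class, including how a system_messages entry is combined with the user prompt, is unchanged and not displayed. A plausible sketch, assuming the class wraps a local transformers text-generation pipeline (the pipeline choice, generation settings, and return handling are assumptions):

from transformers import pipeline

# Abbreviated stand-in for the full system_messages dict shown in the diff above.
system_messages = {"HELP": "...", "PITFALL": "..."}

class LocalLLM():
    def __init__(self, model_name):
        # Load the chat model locally through the text-generation pipeline.
        self.pipe = pipeline("text-generation", model=model_name)

    def predict(self, key, prompt):
        messages = [
            {"role": "system", "content": system_messages[key]},
            {"role": "user", "content": prompt},
        ]
        # With chat-style input the pipeline returns the conversation with the
        # assistant's reply appended as the final message.
        output = self.pipe(messages, max_new_tokens=256)
        return output[0]["generated_text"][-1]["content"]
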
evaluations/pitfalls.py CHANGED
@@ -9,40 +9,5 @@ def evaluate(verbose, llm, zip, readme):
     # Pitfall #1
     for code in codebase:
         code = code[:1000]
-
-
-        # Pitfall #2
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
-
-        # Pitfall #3
-        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
-
-        # Pitfall #4
-        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
-
-        # Pitfall #5
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
-
-        # Pitfall #6
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
-
-        # Pitfall #7
-        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
-
-        # Pitfall #8
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
-
-        # Pitfall #9
-        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
-
-        # Pitfall #10
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
-
-        # Pitfall #11
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
-
-        # Pitfall #12
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
-
-        # Pitfall #13
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
+        log(verbose, "LOG", code)
+        log(verbose, "LOG", llm.predict("PITFALL", f"{code}\n Can you find any signs of common pitfalls in this code?"))
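
Note: both the new calls here and the existing ones in evaluations/documentation.py go through a log(verbose, level, message) helper defined elsewhere in the repository. A minimal sketch of such a helper, assuming the Space surfaces messages through Streamlit when verbose mode is on (the Streamlit calls are an assumption based on app.py's use of st.write):

import streamlit as st

def log(verbose, level, message):
    # Hypothetical sketch: only show messages when verbose mode is enabled.
    if not verbose:
        return
    if level == "ERROR":
        st.error(message)
    else:
        st.write(f"[{level}] {message}")

Routing the pitfall check through a single "PITFALL" system message and logging its answer replaces the thirteen separate "HELP" prompts removed above.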