attilasimko committed
Commit 3abd747 · 1 Parent(s): 67b2553

Let's see some logs

app.py CHANGED
@@ -3,7 +3,7 @@ from evaluations.repo_evaluations import evaluate
 from evaluations.models import RemoteLLM
 import requests
 
-model = RemoteLLM("meta-llama/Llama-3.1-405B-Instruct")
+model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
 
 st.write("\n")
 st.write("Welcome to the online reproducibility evaluation tool!")
evaluations/documentation.py CHANGED
@@ -15,6 +15,12 @@ def evaluate(verbose, llm, zip, readme):
     non_empty_rows = [row for row in readme.split("\n") if row != ""]
     if (len(non_empty_rows) < 5):
         log(verbose, "ERROR", "Readme file has very few lines")
+
+        if (llm):
+            code = fetch_code(zip)
+            if (llm):
+                summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
+                log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
         return overall
 
     if (count_code_lines(non_empty_rows) > 5):
@@ -22,18 +28,12 @@ def evaluate(verbose, llm, zip, readme):
         overall = "Yes"
         return overall
 
-    if (llm):
-        code = fetch_code(zip)
-        if (llm):
-            summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
-            log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
 
     if (llm):
         prompt = f'{readme}\n \
         Is this README file is enough to find what \
         package dependencies you need to install and how to train \
-        and evaluate the proposed model? Please strictly \
-        answer yes or no.\n\nA:'
+        and evaluate the proposed model?'
         llm.predict("HELP", prompt)
 
 
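This hunk moves the LLM-drafted readme suggestion into the branch that fires when the readme has fewer than five non-empty lines, and drops the strict yes/no suffix from the dependency/training prompt. The check just below relies on a count_code_lines() helper that is not shown in this diff; a hypothetical stand-in is sketched here, with the fence-counting heuristic being an assumption rather than the repository's actual logic.

# Hypothetical stand-in for count_code_lines(); the real helper is defined
# elsewhere in the evaluations package. Here, "code lines" are assumed to be
# the lines sitting inside Markdown ``` fences in the readme.
def count_code_lines(rows):
    count, in_fence = 0, False
    for row in rows:
        if row.strip().startswith("```"):
            in_fence = not in_fence  # toggle at every fence marker
        elif in_fence:
            count += 1
    return count

# Example: a readme with a three-line install snippet inside fences returns 3.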
 
evaluations/models.py CHANGED
@@ -8,8 +8,23 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
     "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
     Please help me answer the following question.
     Keep your answers short, and informative.
-    Your answer should be a single paragraph.
-    If you can't find any issues with the code, return an empty string.""" }
+    Your answer should be a single paragraph.""",
+    "PITFALL": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
+    You are looking for common pitfalls in the code. More specifically, please consider the following pitfalls:
+    Please explain if you find any design flaws with regards to the data collection in the code.
+    Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).
+    Please explain if you find any confounders in the code.
+    Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies).
+    Please explain if you find signs of historical biases in the data used.
+    Please explain if you find signs of information leaking between the training and testing data.
+    Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges).
+    Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).
+    Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations).
+    Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency).
+    Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data).
+    Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics).
+    Please explain if you find signs of faulty interpretations of the reported results.
+    If you don't find anything concerning, please return an empty string.""" }
 
 class LocalLLM():
     def __init__(self, model_name):
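The edit keeps the "HELP" persona for free-form answers and introduces a consolidated "PITFALL" system message covering the thirteen checks that used to be separate prompts in evaluations/pitfalls.py. A hedged usage sketch follows; it assumes the repository is importable, that RemoteLLM.predict takes a system-message key plus a prompt (as in the calls elsewhere in this commit), and that the code snippet passed in is a made-up example.

# Hedged usage sketch of the new "PITFALL" system message.
from evaluations.models import RemoteLLM

llm = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
# Made-up snippet with an obvious issue (scoring on the training data).
snippet = "model.fit(X, y)\nprint(model.score(X, y))"
report = llm.predict("PITFALL", f"{snippet}\n Can you find any signs of common pitfalls in this code?")

# The PITFALL prompt asks the model to return an empty string when nothing is
# concerning, so an empty reply can be read as "no issues found".
print(report if report.strip() else "No pitfalls reported.")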
evaluations/pitfalls.py CHANGED
@@ -9,40 +9,5 @@ def evaluate(verbose, llm, zip, readme):
     # Pitfall #1
     for code in codebase:
         code = code[:1000]
-        llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
-
-        # Pitfall #2
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
-
-        # Pitfall #3
-        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
-
-        # Pitfall #4
-        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
-
-        # Pitfall #5
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
-
-        # Pitfall #6
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
-
-        # Pitfall #7
-        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
-
-        # Pitfall #8
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
-
-        # Pitfall #9
-        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
-
-        # Pitfall #10
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
-
-        # Pitfall #11
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
-
-        # Pitfall #12
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
-
-        # Pitfall #13
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
+        log(verbose, "LOG", code)
+        log(verbose, "LOG", llm.predict("PITFALL", f"{code}\n Can you find any signs of common pitfalls in this code?"))