attilasimko committed on
Commit 6fb0159 · 1 Parent(s): 1373b5f
Files changed (2)
  1. evaluations/pitfalls.py +28 -27
  2. evaluations/utils.py +1 -1
evaluations/pitfalls.py CHANGED
@@ -3,44 +3,45 @@ import re
 
 def evaluate(verbose, llm, zip, readme):
     log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
-    code = fetch_code(zip)
+    codebase = fetch_code(zip)
 
     if (llm):
         # Pitfall #1
-        llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
+        for code in codebase:
+            llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
 
-        # Pitfall #2
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
+            # Pitfall #2
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
 
-        # Pitfall #3
-        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
+            # Pitfall #3
+            llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
 
-        # Pitfall #4
-        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
+            # Pitfall #4
+            llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
 
-        # Pitfall #5
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
+            # Pitfall #5
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
 
-        # Pitfall #6
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
+            # Pitfall #6
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
 
-        # Pitfall #7
-        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
+            # Pitfall #7
+            llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
 
-        # Pitfall #8
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
+            # Pitfall #8
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
 
-        # Pitfall #9
-        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
+            # Pitfall #9
+            llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
 
-        # Pitfall #10
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
+            # Pitfall #10
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
 
-        # Pitfall #11
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
-
-        # Pitfall #12
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
+            # Pitfall #11
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
+
+            # Pitfall #12
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
 
-        # Pitfall #13
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
+            # Pitfall #13
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
evaluations/utils.py CHANGED
@@ -9,7 +9,7 @@ import streamlit as st
 def fetch_code(zip_file):
     zip_content_dict = {}
     for file_name in zip_file.namelist():
-        if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")) | (file_name.lower().endswith(".txt")))):
+        if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")))):
             file_content = zip_file.open(file_name).read().decode("utf-8")
             zip_content_dict[file_name] = file_content
     return zip_content_dict
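For comparison, a behaviourally equivalent sketch of the updated fetch_code that replaces the chained bitwise | checks with a single str.endswith call on a tuple of suffixes; the type hint and the with block are added for clarity and are not part of the commit.

import zipfile

def fetch_code(zip_file: zipfile.ZipFile) -> dict:
    # Collect .py, .ipynb and .md files from the archive as {file_name: file_content}.
    zip_content_dict = {}
    for file_name in zip_file.namelist():
        # str.endswith accepts a tuple of suffixes, so one call covers all three extensions.
        if file_name.lower().endswith((".py", ".ipynb", ".md")):
            with zip_file.open(file_name) as handle:
                zip_content_dict[file_name] = handle.read().decode("utf-8")
    return zip_content_dict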