attilasimko committed on
Commit 6fb0159 · 1 Parent(s): 1373b5f
Files changed (2)
  1. evaluations/pitfalls.py +28 -27
  2. evaluations/utils.py +1 -1
evaluations/pitfalls.py CHANGED
@@ -3,44 +3,45 @@ import re
 
 def evaluate(verbose, llm, zip, readme):
     log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
-    code = fetch_code(zip)
+    codebase = fetch_code(zip)
 
     if (llm):
         # Pitfall #1
-        llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
+        for code in codebase:
+            llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
 
-        # Pitfall #2
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
+            # Pitfall #2
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
 
-        # Pitfall #3
-        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
+            # Pitfall #3
+            llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
 
-        # Pitfall #4
-        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
+            # Pitfall #4
+            llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
 
-        # Pitfall #5
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
+            # Pitfall #5
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
 
-        # Pitfall #6
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
+            # Pitfall #6
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
 
-        # Pitfall #7
-        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
+            # Pitfall #7
+            llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
 
-        # Pitfall #8
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
+            # Pitfall #8
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
 
-        # Pitfall #9
-        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
+            # Pitfall #9
+            llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
 
-        # Pitfall #10
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
+            # Pitfall #10
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
 
-        # Pitfall #11
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
-
-        # Pitfall #12
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
+            # Pitfall #11
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
+
+            # Pitfall #12
+            llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
 
-        # Pitfall #13
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
+            # Pitfall #13
+            llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
evaluations/utils.py CHANGED
@@ -9,7 +9,7 @@ import streamlit as st
 def fetch_code(zip_file):
     zip_content_dict = {}
     for file_name in zip_file.namelist():
-        if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")) | (file_name.lower().endswith(".txt")))):
+        if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")))):
             file_content = zip_file.open(file_name).read().decode("utf-8")
             zip_content_dict[file_name] = file_content
     return zip_content_dict
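For comparison, a behaviourally equivalent sketch of the updated fetch_code that replaces the chained bitwise | checks with a single str.endswith call on a tuple of suffixes; the type hint and the with block are added for clarity and are not part of the commit.

import zipfile

def fetch_code(zip_file: zipfile.ZipFile) -> dict:
    # Collect .py, .ipynb and .md files from the archive as {file_name: file_content}.
    zip_content_dict = {}
    for file_name in zip_file.namelist():
        # str.endswith accepts a tuple of suffixes, so one call covers all three extensions.
        if file_name.lower().endswith((".py", ".ipynb", ".md")):
            with zip_file.open(file_name) as handle:
                zip_content_dict[file_name] = handle.read().decode("utf-8")
    return zip_content_dict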