reproduce / evaluations / pitfalls.py
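"""Evaluation step that scans a code submission for common machine-learning
pitfalls (data collection flaws, dataset shift, confounders, leakage,
overfitting, misused metrics, ...) by asking an LLM one question per pitfall."""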
from .utils import log, fetch_code
def evaluate(verbose, llm, zip, readme):
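    """Run one LLM query per known pitfall against the submitted code.

    `llm` may be falsy, in which case all checks are skipped; `readme` is
    currently unused. The return values of `llm.predict` are not captured here.
    """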
    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
    code = fetch_code(zip)
    if llm:
        # Pitfall #1
        llm.predict("HELP", f"{code}\n Please explain if you find any design flaws with regard to the data collection in the code.")
        # Pitfall #2
        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
        # Pitfall #3
        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
        # Pitfall #4
        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (e.g. labelling mistakes, noisy measurements, inappropriate proxies).")
        # Pitfall #5
        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
        # Pitfall #6
        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leakage between the training and testing data.")
        # Pitfall #7
        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. an over-complicated or over-simplistic model, computational challenges).")
        # Pitfall #8
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
        # Pitfall #9
        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations).")
        # Pitfall #10
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black-box models in the code (e.g. lack of interpretability, lack of transparency).")
        # Pitfall #11
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data).")
        # Pitfall #12
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics).")
        # Pitfall #13
        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
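
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).
# Assumptions: `llm` is any object exposing a predict(tag, prompt) method,
# matching how it is used above, and `zip` is whatever path fetch_code
# accepts. MockLLM and "submission.zip" are hypothetical placeholders.
# Because of the relative import at the top, run this as a module within its
# package rather than as a standalone script.
if __name__ == "__main__":
    class MockLLM:
        """Stand-in LLM that echoes the tag and the question asked."""
        def predict(self, tag, prompt):
            # The question is the last line of each prompt (after the code).
            print(f"[{tag}] {prompt.splitlines()[-1].strip()}")

    evaluate(verbose=True, llm=MockLLM(), zip="submission.zip", readme=None)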