reproduce / evaluations / pitfalls.py
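"""Evaluation step that scans a code submission for common machine-learning
pitfalls (data collection flaws, dataset shift, confounders, leakage,
overfitting, misused metrics, ...) by asking an LLM one question per pitfall."""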
from .utils import log, fetch_code
def evaluate(verbose, llm, zip, readme):
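    """Run one LLM query per known pitfall against the submitted code.

    `llm` may be falsy, in which case all checks are skipped; `readme` is
    currently unused. The return values of `llm.predict` are not captured here.
    """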
    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
    code = fetch_code(zip)
    if llm:
        # Pitfall #1
        llm.predict("HELP", f"{code}\n Please explain if you find any design flaws with regard to the data collection in the code.")
        # Pitfall #2
        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
        # Pitfall #3
        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
        # Pitfall #4
        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (e.g. labelling mistakes, noisy measurements, inappropriate proxies).")
        # Pitfall #5
        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
        # Pitfall #6
        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leakage between the training and testing data.")
        # Pitfall #7
        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. an over-complicated or over-simplistic model, computational challenges).")
        # Pitfall #8
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
        # Pitfall #9
        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations).")
        # Pitfall #10
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black-box models in the code (e.g. lack of interpretability, lack of transparency).")
        # Pitfall #11
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data).")
        # Pitfall #12
        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics).")
        # Pitfall #13
        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
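
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).
# Assumptions: `llm` is any object exposing a predict(tag, prompt) method,
# matching how it is used above, and `zip` is whatever path fetch_code
# accepts. MockLLM and "submission.zip" are hypothetical placeholders.
# Because of the relative import at the top, run this as a module within its
# package rather than as a standalone script.
if __name__ == "__main__":
    class MockLLM:
        """Stand-in LLM that echoes the tag and the question asked."""
        def predict(self, tag, prompt):
            # The question is the last line of each prompt (after the code).
            print(f"[{tag}] {prompt.splitlines()[-1].strip()}")

    evaluate(verbose=True, llm=MockLLM(), zip="submission.zip", readme=None)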