Commit · 3abd747
1 Parent(s): 67b2553

Let's see some logs

Files changed:
- app.py +1 -1
- evaluations/documentation.py +7 -7
- evaluations/models.py +17 -2
- evaluations/pitfalls.py +2 -37
app.py CHANGED
@@ -3,7 +3,7 @@ from evaluations.repo_evaluations import evaluate
 from evaluations.models import RemoteLLM
 import requests

-model = RemoteLLM("meta-llama/Llama-3.1-
+model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")

 st.write("\n")
 st.write("Welcome to the online reproducibility evaluation tool!")
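
Note: RemoteLLM is defined in evaluations/models.py and its body is not part of this diff. As a rough, hypothetical sketch of what such a wrapper might look like (assuming it calls the Hugging Face Inference API via huggingface_hub.InferenceClient and pairs an entry of the system_messages dict with the user prompt; only the class name and the predict(key, prompt) call shape are taken from this commit):

import os
from huggingface_hub import InferenceClient

# Abbreviated stand-in for the system_messages dict defined in evaluations/models.py.
system_messages = {"HELP": "You are a chatbot evaluating github repositories..."}

class RemoteLLM:
    def __init__(self, model_name):
        # Assumes an HF API token is available in the Space's environment.
        self.client = InferenceClient(model=model_name, token=os.environ.get("HF_TOKEN"))

    def predict(self, key, prompt):
        # Pair the selected system message ("HELP", "PITFALL", ...) with the user prompt.
        response = self.client.chat_completion(
            messages=[
                {"role": "system", "content": system_messages[key]},
                {"role": "user", "content": prompt},
            ],
            max_tokens=512,
        )
        return response.choices[0].message.content

With a wrapper along these lines, the changed line in app.py only swaps the hosted model id; later model.predict("HELP", ...) calls reuse the same client.
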
evaluations/documentation.py CHANGED
@@ -15,6 +15,12 @@ def evaluate(verbose, llm, zip, readme):
     non_empty_rows = [row for row in readme.split("\n") if row != ""]
     if (len(non_empty_rows) < 5):
         log(verbose, "ERROR", "Readme file has very few lines")
+
+        if (llm):
+            code = fetch_code(zip)
+            if (llm):
+                summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
+                log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
         return overall

     if (count_code_lines(non_empty_rows) > 5):
@@ -22,18 +28,12 @@ def evaluate(verbose, llm, zip, readme):
         overall = "Yes"
         return overall

-    if (llm):
-        code = fetch_code(zip)
-        if (llm):
-            summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
-            log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")

     if (llm):
         prompt = f'{readme}\n \
             Is this README file is enough to find what \
             package dependencies you need to install and how to train \
-            and evaluate the proposed model?
-            answer yes or no.\n\nA:'
+            and evaluate the proposed model?'
         llm.predict("HELP", prompt)


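
Note: fetch_code(zip) is a helper defined elsewhere in the repository and is not shown here. A minimal sketch of what it plausibly does, assuming zip points to a zipped copy of the repository and the helper concatenates its Python sources into one string for the prompt (the names and details are assumptions, not taken from the commit):

import zipfile

def fetch_code(zip_path):
    # Hypothetical sketch: gather every .py file in the zipped repository into one string.
    chunks = []
    with zipfile.ZipFile(zip_path) as archive:
        for name in archive.namelist():
            if name.endswith(".py"):
                source = archive.read(name).decode("utf-8", errors="ignore")
                chunks.append(f"# File: {name}\n{source}")
    return "\n\n".join(chunks)
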
evaluations/models.py CHANGED
@@ -8,8 +8,23 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
     "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
     Please help me answer the following question.
     Keep your answers short, and informative.
-    Your answer should be a single paragraph.
-
+    Your answer should be a single paragraph.""",
+    "PITFALL": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
+    You are looking for common pitfalls in the code. More specifically please consider the follwing pitfalls:
+    Please explain if you find any design-flaws with regards to the data collection in the code."))
+    Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments)."))
+    Please explain if you find any confounders in the code."))
+    Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)"))
+    Please explain if you find signs of historical biases in the data used."))
+    Please explain if you find signs of information leaking between the training and testing data."))
+    Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)"))
+    Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias)."))
+    Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)"))
+    Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)"))
+    Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)"))
+    Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)"))
+    Please explain if you find signs of faulty interpretations of the reported results.
+    If you don't find anything concerning, please return an empty string.""" }

 class LocalLLM():
     def __init__(self, model_name):
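
Note: only the first two lines of LocalLLM appear above; the rest of the class, including how a system_messages entry is combined with the user prompt, is unchanged and not displayed. A plausible sketch, assuming the class wraps a local transformers text-generation pipeline (the pipeline choice, generation settings, and return handling are assumptions):

from transformers import pipeline

# Abbreviated stand-in for the full system_messages dict shown in the diff above.
system_messages = {"HELP": "...", "PITFALL": "..."}

class LocalLLM():
    def __init__(self, model_name):
        # Load the chat model locally through the text-generation pipeline.
        self.pipe = pipeline("text-generation", model=model_name)

    def predict(self, key, prompt):
        messages = [
            {"role": "system", "content": system_messages[key]},
            {"role": "user", "content": prompt},
        ]
        # With chat-style input the pipeline returns the conversation with the
        # assistant's reply appended as the final message.
        output = self.pipe(messages, max_new_tokens=256)
        return output[0]["generated_text"][-1]["content"]
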
evaluations/pitfalls.py CHANGED
@@ -9,40 +9,5 @@ def evaluate(verbose, llm, zip, readme):
     # Pitfall #1
     for code in codebase:
         code = code[:1000]
-
-
-        # Pitfall #2
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
-
-        # Pitfall #3
-        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
-
-        # Pitfall #4
-        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
-
-        # Pitfall #5
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
-
-        # Pitfall #6
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
-
-        # Pitfall #7
-        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
-
-        # Pitfall #8
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
-
-        # Pitfall #9
-        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
-
-        # Pitfall #10
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
-
-        # Pitfall #11
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
-
-        # Pitfall #12
-        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
-
-        # Pitfall #13
-        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
+        log(verbose, "LOG", code)
+        log(verbose, "LOG", llm.predict("PITFALL", f"{code}\n Can you find any signs of common pitfalls in this code?"))
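
Note: both the new calls here and the existing ones in evaluations/documentation.py go through a log(verbose, level, message) helper defined elsewhere in the repository. A minimal sketch of such a helper, assuming the Space surfaces messages through Streamlit when verbose mode is on (the Streamlit calls are an assumption based on app.py's use of st.write):

import streamlit as st

def log(verbose, level, message):
    # Hypothetical sketch: only show messages when verbose mode is enabled.
    if not verbose:
        return
    if level == "ERROR":
        st.error(message)
    else:
        st.write(f"[{level}] {message}")

Routing the pitfall check through a single "PITFALL" system message and logging its answer replaces the thirteen separate "HELP" prompts removed above.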