Commit 08a78c1 · 1 Parent(s): 0dc48b3

let's see...

Files changed:
- evaluations/models.py +2 -1
- evaluations/pitfalls.py +43 -3
- evaluations/repo_evaluations.py +9 -13
- evaluations/utils.py +13 -1
evaluations/models.py
CHANGED
@@ -8,7 +8,8 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
     "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
     Please help me answer the following question.
     Keep your answers short, and informative.
-    Your answer should be a single paragraph.""" }
+    Your answer should be a single paragraph.
+    If you can't find any issues with the code, return an empty string.""" }
 
 class LocalLLM():
     def __init__(self, model_name):
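Note on this change: the added sentence gives callers a machine-checkable "no findings" signal, since an empty HELP answer now means the model found nothing. A minimal sketch of how a caller might rely on that, assuming llm.predict returns the model's text as the other files in this commit suggest (the has_issues helper is hypothetical, not part of the commit):

def has_issues(llm, code, question):
    # Per the updated HELP message, an empty answer means "no issues found".
    # has_issues() and this calling pattern are illustrative, not commit code.
    answer = llm.predict("HELP", f"{code}\n {question}")
    return answer.strip() != ""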
evaluations/pitfalls.py
CHANGED
@@ -1,6 +1,46 @@
-from .utils import log
+from .utils import log, fetch_code
 import re
 
 def evaluate(verbose, llm, zip, readme):
-    log(verbose, "TITLE", "\nLooking for common pitfalls...")
-
+    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
+    code = fetch_code(zip)
+
+    if (llm):
+        # Pitfall #1
+        llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
+
+        # Pitfall #2
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
+
+        # Pitfall #3
+        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
+
+        # Pitfall #4
+        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
+
+        # Pitfall #5
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
+
+        # Pitfall #6
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
+
+        # Pitfall #7
+        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
+
+        # Pitfall #8
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
+
+        # Pitfall #9
+        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
+
+        # Pitfall #10
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
+
+        # Pitfall #11
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
+
+        # Pitfall #12
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
+
+        # Pitfall #13
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
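Review note: every llm.predict(...) call above discards its return value, so the thirteen pitfall answers are computed and then dropped. A sketch of a table-driven variant that collects them, assuming the same log/fetch_code/predict interfaces; the PITFALL_PROMPTS table and the returned dict are illustrative, not part of the commit:

from .utils import log, fetch_code

# Hypothetical prompt table, one entry per pitfall (remaining entries elided).
PITFALL_PROMPTS = {
    "data_collection": "Please explain if you find any design-flaws with regards to the data collection in the code.",
    "dataset_shift": "Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).",
    "confounders": "Please explain if you find any confounders in the code.",
}

def evaluate(verbose, llm, zip, readme):
    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
    code = fetch_code(zip)
    answers = {}
    if llm:
        for name, question in PITFALL_PROMPTS.items():
            # Keep each answer instead of discarding predict()'s return value.
            answers[name] = llm.predict("HELP", f"{code}\n {question}")
    return answers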
evaluations/repo_evaluations.py
CHANGED
@@ -1,6 +1,6 @@
 import pandas as pd
 import os
-from evaluations import documentation, requirements, training, validating, license, weights
+from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
 from evaluations.utils import *
 import zipfile
 import os
@@ -52,18 +52,14 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
         results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
         if (len(zip.namelist()) <= 2):
             log(verbose, "LOG", "The repository is empty.")
-
-
-
-
-
-
-
-
-        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
-        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
-        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
-        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+
+        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
+        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
+        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
+        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
+        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
+        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+        pitfalls.evaluate(verbose, llm, zip, readme)
 
         return results
     except Exception as e:
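Review note: pitfalls.evaluate(verbose, llm, zip, readme) is the one call in this block whose result is not stored in results, consistent with that function currently returning nothing. If its findings were returned, the natural extension would be a line such as the following, where the pred_pitfalls key is an assumption, not in the commit:

        results["pred_pitfalls"] = pitfalls.evaluate(verbose, llm, zip, readme)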
evaluations/utils.py
CHANGED
@@ -2,10 +2,22 @@ import time
 import requests
 import time
 import os
+import zipfile
 import json
 import streamlit as st
 
-
+def fetch_code(path):
+    zip_content_dict = {}
+    with zipfile.ZipFile(path, 'r') as zip_ref:
+        for file_name in zip_ref.namelist():
+            if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")) | (file_name.lower().endswith(".txt")))):
+                with zip_ref.open(file_name) as file:
+                    file_content = file.read().decode('utf-8')
+                    zip_content_dict[file_name] = file_content
+                with zip_ref.open(file_name) as file:
+                    file_content = file.read().decode('utf-8')
+                    zip_content_dict[file_name] = file_content
+    return zip_content_dict
 
 def get_api_link(url):
     username, repo_name = decompose_url(url)
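Review note: as committed, fetch_code reads and stores each matching file twice (the with zip_ref.open(file_name) block is duplicated verbatim), and it chains the endswith checks with the bitwise | operator. A de-duplicated sketch with the same behaviour, using the tuple form of str.endswith; the CODE_SUFFIXES name is illustrative, not part of the commit:

import zipfile

# Suffixes of files whose text content should be collected from the archive.
CODE_SUFFIXES = (".py", ".ipynb", ".md", ".txt")

def fetch_code(path):
    zip_content_dict = {}
    with zipfile.ZipFile(path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.lower().endswith(CODE_SUFFIXES):
                # Read each file once; the committed version repeats this block.
                with zip_ref.open(file_name) as file:
                    zip_content_dict[file_name] = file.read().decode('utf-8')
    return zip_content_dict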