Commit 08a78c1 · 1 Parent(s): 0dc48b3

let's see...

Files changed:
- evaluations/models.py +2 -1
- evaluations/pitfalls.py +43 -3
- evaluations/repo_evaluations.py +9 -13
- evaluations/utils.py +13 -1
evaluations/models.py
CHANGED
@@ -8,7 +8,8 @@ system_messages = { "STRICT": """You are a chatbot evaluating github repositorie
     "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
     Please help me answer the following question.
     Keep your answers short, and informative.
-    Your answer should be a single paragraph.""" }
+    Your answer should be a single paragraph.
+    If you can't find any issues with the code, return an empty string.""" }
 
 class LocalLLM():
     def __init__(self, model_name):
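Note on this change: the added sentence gives callers a machine-checkable "no findings" signal, since an empty HELP answer now means the model found nothing. A minimal sketch of how a caller might rely on that, assuming llm.predict returns the model's text as the other files in this commit suggest (the has_issues helper is hypothetical, not part of the commit):

def has_issues(llm, code, question):
    # Per the updated HELP message, an empty answer means "no issues found".
    # has_issues() and this calling pattern are illustrative, not commit code.
    answer = llm.predict("HELP", f"{code}\n {question}")
    return answer.strip() != ""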
evaluations/pitfalls.py
CHANGED
@@ -1,6 +1,46 @@
-from .utils import log
+from .utils import log, fetch_code
 import re
 
 def evaluate(verbose, llm, zip, readme):
-    log(verbose, "TITLE", "\nLooking for common pitfalls...")
-
+    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
+    code = fetch_code(zip)
+
+    if (llm):
+        # Pitfall #1
+        llm.predict("HELP", f"{code}\n Please explain if you find any design-flaws with regards to the data collection in the code.")
+
+        # Pitfall #2
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).")
+
+        # Pitfall #3
+        llm.predict("HELP", f"{code}\n Please explain if you find any confounders in the code.")
+
+        # Pitfall #4
+        llm.predict("HELP", f"{code}\n Please explain if you find any measurement errors in the code (labelling mistakes, noisy measurements, inappropriate proxies)")
+
+        # Pitfall #5
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of historical biases in the data used.")
+
+        # Pitfall #6
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of information leaking between the training and testing data.")
+
+        # Pitfall #7
+        llm.predict("HELP", f"{code}\n Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges)")
+
+        # Pitfall #8
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).")
+
+        # Pitfall #9
+        llm.predict("HELP", f"{code}\n Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations)")
+
+        # Pitfall #10
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency)")
+
+        # Pitfall #11
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data)")
+
+        # Pitfall #12
+        llm.predict("HELP", f"{code}\n Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics)")
+
+        # Pitfall #13
+        llm.predict("HELP", f"{code}\n Please explain if you find signs of faulty interpretations of the reported results.")
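Review note: every llm.predict(...) call above discards its return value, so the thirteen pitfall answers are computed and then dropped. A sketch of a table-driven variant that collects them, assuming the same log/fetch_code/predict interfaces; the PITFALL_PROMPTS table and the returned dict are illustrative, not part of the commit:

from .utils import log, fetch_code

# Hypothetical prompt table, one entry per pitfall (remaining entries elided).
PITFALL_PROMPTS = {
    "data_collection": "Please explain if you find any design-flaws with regards to the data collection in the code.",
    "dataset_shift": "Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).",
    "confounders": "Please explain if you find any confounders in the code.",
}

def evaluate(verbose, llm, zip, readme):
    log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
    code = fetch_code(zip)
    answers = {}
    if llm:
        for name, question in PITFALL_PROMPTS.items():
            # Keep each answer instead of discarding predict()'s return value.
            answers[name] = llm.predict("HELP", f"{code}\n {question}")
    return answers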
evaluations/repo_evaluations.py
CHANGED
@@ -1,6 +1,6 @@
 import pandas as pd
 import os
-from evaluations import documentation, requirements, training, validating, license, weights
+from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
 from evaluations.utils import *
 import zipfile
 import os
@@ -52,18 +52,14 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
         results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
         if (len(zip.namelist()) <= 2):
             log(verbose, "LOG", "The repository is empty.")
-
-
-
-
-
-
-
-
-        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
-        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
-        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
-        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+
+        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
+        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
+        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
+        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
+        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
+        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+        pitfalls.evaluate(verbose, llm, zip, readme)
 
         return results
     except Exception as e:
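Review note: pitfalls.evaluate(verbose, llm, zip, readme) is the one call in this block whose result is not stored in results, consistent with that function currently returning nothing. If its findings were returned, the natural extension would be a line such as the following, where the pred_pitfalls key is an assumption, not in the commit:

        results["pred_pitfalls"] = pitfalls.evaluate(verbose, llm, zip, readme)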
evaluations/utils.py
CHANGED
@@ -2,10 +2,22 @@ import time
 import requests
 import time
 import os
+import zipfile
 import json
 import streamlit as st
 
-
+def fetch_code(path):
+    zip_content_dict = {}
+    with zipfile.ZipFile(path, 'r') as zip_ref:
+        for file_name in zip_ref.namelist():
+            if ((file_name.lower().endswith(".py") | (file_name.lower().endswith(".ipynb")) | (file_name.lower().endswith(".md")) | (file_name.lower().endswith(".txt")))):
+                with zip_ref.open(file_name) as file:
+                    file_content = file.read().decode('utf-8')
+                    zip_content_dict[file_name] = file_content
+                with zip_ref.open(file_name) as file:
+                    file_content = file.read().decode('utf-8')
+                    zip_content_dict[file_name] = file_content
+    return zip_content_dict
 
 def get_api_link(url):
     username, repo_name = decompose_url(url)
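Review note: as committed, fetch_code reads and stores each matching file twice (the with zip_ref.open(file_name) block is duplicated verbatim), and it chains the endswith checks with the bitwise | operator. A de-duplicated sketch with the same behaviour, using the tuple form of str.endswith; the CODE_SUFFIXES name is illustrative, not part of the commit:

import zipfile

# Suffixes of files whose text content should be collected from the archive.
CODE_SUFFIXES = (".py", ".ipynb", ".md", ".txt")

def fetch_code(path):
    zip_content_dict = {}
    with zipfile.ZipFile(path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.lower().endswith(CODE_SUFFIXES):
                # Read each file once; the committed version repeats this block.
                with zip_ref.open(file_name) as file:
                    zip_content_dict[file_name] = file.read().decode('utf-8')
    return zip_content_dict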