# reproduce/evaluations/models.py
from transformers import pipeline
from huggingface_hub import InferenceClient
import os
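
# System prompts keyed by response type: "STRICT" forces bare yes/no answers, "HELP" asks for a
# short explanatory paragraph, and "PITFALL" asks for a review of common ML pitfalls in the code.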
system_messages = {
    "STRICT": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
Strictly answer the questions with "Yes" or "No".
Don't use any punctuation either.""",
    "HELP": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
Please help me answer the following question.
Keep your answers short, and informative.
Your answer should be a single paragraph.""",
    "PITFALL": """You are a chatbot evaluating github repositories, their python codes and corresponding readme files.
You are looking for common pitfalls in the code. More specifically, please consider the following pitfalls:
Please explain if you find any design flaws with regards to the data collection in the code.
Please explain if you find signs of dataset shift in the code (e.g. sampling bias, imbalanced populations, imbalanced labels, non-stationary environments).
Please explain if you find any confounders in the code.
Please explain if you find any measurement errors in the code (e.g. labelling mistakes, noisy measurements, inappropriate proxies).
Please explain if you find signs of historical biases in the data used.
Please explain if you find signs of information leaking between the training and testing data.
Please explain if you find a model-problem mismatch (e.g. over-complicated/simplistic model, computational challenges).
Please explain if you find any signs of overfitting in the code (e.g. high variance, high complexity, low bias).
Please explain if you find any misused metrics in the code (e.g. poor metric selection, poor implementations).
Please explain if you find any signs of black box models in the code (e.g. lack of interpretability, lack of transparency).
Please explain if you find any signs of baseline comparison issues in the code (e.g. if the testing data does not fit the training data).
Please explain if you find any signs of insufficient reporting in the code (e.g. missing hyperparameters, missing evaluation metrics).
Please explain if you find signs of faulty interpretations of the reported results.
If you don't find anything concerning, please return an empty string.""",
}

class LocalLLM():
    def __init__(self, model_name):
        # Load the model locally through a text-generation pipeline;
        # device_map={"": 0} places the whole model on GPU 0.
        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000, device_map={"": 0})

    def predict(self, response_type, prompt):
        messages = [
            {"role": "system", "content": system_messages[response_type]},
            {"role": "user", "content": prompt},
        ]
        res = self.pipe(messages)
        # The pipeline returns the whole chat history; keep only the assistant turn.
        res = res[0]["generated_text"]
        res = [response for response in res if response["role"] == "assistant"][0]["content"]
        res = res.strip()
        return res

class RemoteLLM():
    def __init__(self, model_name):
        # Authenticate against the Hugging Face Inference API with a token read from the environment.
        token = os.getenv("hfToken")
        self.model_name = model_name
        self.client = InferenceClient(api_key=token)

    def predict(self, response_type, prompt):
        message = self.client.chat_completion(
            model=self.model_name, max_tokens=500, stream=False,
            messages=[{"role": "system", "content": system_messages[response_type]},
                      {"role": "user", "content": prompt}])
        return message["choices"][0]["message"]["content"]
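
# Usage sketch (illustrative, not part of the original module): both wrappers expose the same
# predict(response_type, prompt) interface, so a local pipeline and the hosted Inference API
# can be swapped freely. The model id below is a placeholder assumption.
if __name__ == "__main__":
    llm = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
    print(llm.predict("STRICT", "Does the repository contain a requirements.txt file?"))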