"""Heuristic checks of repository documentation quality.

``is_applicable`` asks an LLM which evaluation axes (training /
evaluation / pre-trained weights) apply to a repository; ``evaluate``
scores the README files and comment density of a repository archive.
"""
import re

import numpy as np

from core.conversion import noop_logger

from .utils import fetch_code


def is_applicable(llm, readme, log_fn=noop_logger):
    """Ask the LLM which documentation checks apply to this repository.

    Args:
        llm: LLM wrapper exposing ``predict(mode, prompt)``, or a falsy
            value to skip all LLM queries.
        readme: README text used as context for the questions.
        log_fn: Logging callback invoked as ``log_fn(level, message)``.

    Returns:
        A ``"training/evaluation/weights"`` string; each part is the
        LLM's answer, or ``"NA"`` when no LLM is available.
    """
    res_training = "NA"
    res_evaluation = "NA"
    res_weights = "NA"
    if llm:
        log_fn("TITLE", "\nChecking what parts of the evaluations are applicable...")
        res_training = llm.predict(
            "STRICT",
            f"{readme}\nBased on the readme above, should the repository contain code for training a model?",
        )
        res_evaluation = llm.predict(
            "STRICT",
            f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?",
        )
        res_weights = llm.predict(
            "STRICT",
            f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?",
        )
    return f"{res_training}/{res_evaluation}/{res_weights}"


def _mentions(text, *needles):
    """Return True when any of *needles* occurs in *text* (case-insensitive)."""
    return any(re.search(needle, text, re.IGNORECASE) for needle in needles)


def evaluate(llm, zip, readmes, log_fn=noop_logger):
    """Score the documentation quality of a repository archive.

    Args:
        llm: Optional LLM wrapper exposing ``predict(mode, prompt)``.
        zip: Open ``zipfile.ZipFile``-like archive of the repository.
        readmes: Iterable of README file contents (strings).
        log_fn: Logging callback invoked as ``log_fn(level, message)``.

    Returns:
        "Yes" when at least two documentation criteria are satisfied,
        otherwise "No". Returns "No" early when a README has fewer than
        five non-empty lines.
    """
    log_fn("TITLE", "\nEvaluating code documentation...")
    overall = "No"
    code_to_comment_ratio = get_code_to_comment_ratio(zip)
    log_fn("LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
    result = {
        "dependencies": "No",
        "training": "No",
        "evaluation": "No",
        # NOTE(review): "weights" is never set below — confirm whether a
        # weights check was intended but omitted.
        "weights": "No",
        "scripts": "No",
    }
    for readme in readmes:
        non_empty_rows = [row for row in readme.split("\n") if row != ""]
        if len(non_empty_rows) < 5:
            log_fn("ERROR", "Readme file has very few lines")
            if llm:
                # Offer an LLM-generated README draft based on the code.
                code = fetch_code(zip)
                summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
                log_fn("LOG", f"Based on the code, your readme file could be something like...\n{summary}")
            return overall
        if count_code_lines(non_empty_rows) > 2:
            log_fn("LOG", "Readme file contains python examples.")
            result["scripts"] = "Yes"
        if llm:
            # Fix: original prompt read "Is this README file is enough".
            prompt = (
                f"{readme}\nIs this README file enough to find what "
                "package dependencies you need to install and how to "
                "train and evaluate the proposed model?"
            )
            llm.predict("HELP", prompt)
        # Keyword probes over the README text, case-insensitive.
        if not _mentions(readme, "package", "dependenc", "requirement"):
            log_fn("ERROR", "Readme file missing information about package dependencies")
        else:
            result["dependencies"] = "Yes"
        if not _mentions(readme, "train"):
            log_fn("ERROR", "Readme file missing training information")
        else:
            result["training"] = "Yes"
        if not (_mentions(readme, "demo") and _mentions(readme, "evaluat")):
            log_fn("ERROR", "Readme file missing testing information")
        else:
            # Fix: originally wrote result["evaluating"], so the
            # "evaluation" flag could never become "Yes".
            result["evaluation"] = "Yes"
        if not _mentions(readme, "example"):
            log_fn("LOG", "Readme file contains no links to examples")
        else:
            # Fix: same "evaluating" key typo as above.
            result["evaluation"] = "Yes"
    score = sum(value == "Yes" for value in result.values())
    return "Yes" if score >= 2 else "No"


def count_comment_lines(lines):
    """Count comment lines in a Python source file.

    A heuristic: ``#``-prefixed lines count as single-line comments, and
    triple-quoted blocks (docstrings) count as multi-line comments,
    delimiters included. Triple-quoted strings that are not docstrings
    (e.g. assigned literals) are indistinguishable here and are counted
    the same way.

    Args:
        lines: The file's lines, as strings.

    Returns:
        Tuple ``(single_line_comments, multi_line_comments)``.
    """
    single_line_comments = 0
    multi_line_comments = 0
    in_multiline_comment = False
    for line in lines:
        stripped_line = line.strip()
        if in_multiline_comment:
            # Every line inside a docstring counts; a closing delimiter
            # ends the block. (Fix: lines inside a docstring that start
            # with '#' used to be double-counted as single-line too.)
            multi_line_comments += 1
            if stripped_line.endswith('"""') or stripped_line.endswith("'''"):
                in_multiline_comment = False
        elif stripped_line.startswith('"""') or stripped_line.startswith("'''"):
            multi_line_comments += 1
            quote = stripped_line[:3]
            # Fix: a one-line docstring ("""...""") used to toggle the
            # in-docstring state on, corrupting all subsequent counts.
            if not (len(stripped_line) >= 6 and stripped_line.endswith(quote)):
                in_multiline_comment = True
        elif stripped_line.startswith('#'):
            single_line_comments += 1
    return single_line_comments, multi_line_comments


def get_code_to_comment_ratio(zip):
    """Return the percentage of comment lines relative to code lines.

    Despite the name, the value is ``comments / code * 100`` (the log
    message in ``evaluate`` correctly calls it a comment-to-code ratio).
    Scans ``.py`` and ``.ipynb`` members of the archive; note that the
    comment heuristic is line-based and only approximate for ``.ipynb``
    JSON content.

    Args:
        zip: Open ``zipfile.ZipFile``-like archive.
    """
    python_files = [
        file_path
        for file_path in zip.namelist()
        if file_path.endswith((".py", ".ipynb"))
    ]
    code_line_count = 1  # start at 1 so an archive with no code cannot divide by zero
    comment_line_count = 0
    for file in python_files:
        # Fix: the member handle was opened but never closed.
        with zip.open(file) as handle:
            file_lines = handle.read().decode("utf-8").split('\n')
        sl_comm, ml_comm = count_comment_lines(file_lines)
        comment_line_count += sl_comm + ml_comm
        code_line_count += len(file_lines) - (sl_comm + ml_comm)
    return 100 * comment_line_count / code_line_count


def count_code_lines(lines):
    """Estimate the number of example-code lines in a README.

    Counts every line belonging to a ``` fenced block, fences included,
    then halves the total. The halving is preserved from the original
    heuristic — presumably to discount fences/short snippets; confirm
    before changing, since ``evaluate`` thresholds on the result.

    Args:
        lines: Non-empty README lines.

    Returns:
        The (halved) fenced-line count as an int.
    """
    is_code_snippet = False
    code_line_count = 0
    for line in lines:
        stripped_line = line.strip()
        if stripped_line.startswith('```'):
            # Fence line: toggles the in-snippet state and is counted.
            is_code_snippet = not is_code_snippet
            code_line_count += 1
        elif is_code_snippet:
            code_line_count += 1
    return int(code_line_count / 2)