Attila Simkó committed on
Commit 2db37b1 · 1 Parent(s): 3cfadc8

big upgrade

.gitignore CHANGED
@@ -1,10 +1,4 @@
- data/MIDL.csv
- data/MICCAI.csv
- data/arXiv.csv
- data/Nature.csv
- data/results.csv
- data/*.zip
- data/test/*
+ data/
  *.env
  .env
  evaluations/__pycache__/*
app.py CHANGED
@@ -2,8 +2,11 @@ import streamlit as st
  from evaluations.repo_evaluations import evaluate
  from evaluations.models import RemoteLLM
  import requests
+ from core.paper import Paper
+ from core.conversion import fetch_repo
+ import os
 
- model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
+ model = None # RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
 
  st.write("\n")
  st.write("Welcome to the online reproducibility evaluation tool!")
@@ -12,10 +15,11 @@ st.write("Additionally we look for common pitfalls in the code according to a pu
 
  checkbox = st.checkbox("Would you like to see recommendations during evaluation?", value=False)
  repo_link = st.text_input("Github repository link:", value="", type="default", help=None)
- 
  if (repo_link):
      verbose = 4 if checkbox else 3
-     evaluate(llm=model, verbose=verbose, repo_url=repo_link)
+     paper = Paper.from_url(repo_link, verbose=verbose)
+     fetch_repo(0, paper.main_repo_url, paper.zip_path, os.getenv("githubToken"))
+     evaluate(model, paper, paper.log)
 
  with st.form("my_form"):
      st.write("Notice something wrong? Please tell us so we can improve.")
config/__pycache__/constants.cpython-312.pyc ADDED
Binary file (684 Bytes).
 
config/constants.py ADDED
@@ -0,0 +1,12 @@
+ from enum import Enum, auto
+
+ VENUE_ORDER = ["MICCAI", "MIDL", "Nature", "arXiv"]
+ MIDL_COLORS = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000", "#FF862C", "#800080"]
+
+ class LogType(Enum):
+     TITLE = "TITLE"
+     LOG = "LOG"
+     ERROR = "ERROR"
+     WARNING = "WARNING"
+     NOTE = "NOTE"
+     INFO = "INFO"
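A brief illustration of how these constants are consumed elsewhere in this commit: the scraping scripts iterate VENUE_ORDER, and Paper.log looks levels up by name in LogType.

```python
from config.constants import VENUE_ORDER, MIDL_COLORS, LogType

# Pair each venue with a plot colour (MIDL_COLORS holds extra colours beyond the four venues).
venue_colors = dict(zip(VENUE_ORDER, MIDL_COLORS))
print(venue_colors["MICCAI"])        # '#506775'

# Levels are looked up by name, exactly as Paper.log does with LogType[level.upper()].
level = LogType["WARNING"]
print(level.name, level.value)       # WARNING WARNING
```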
core/__pycache__/conversion.cpython-312.pyc ADDED
Binary file (10.1 kB).
 
core/__pycache__/paper.cpython-312.pyc ADDED
Binary file (8.64 kB).
 
core/conversion.py ADDED
@@ -0,0 +1,205 @@
+ import os
+ import random
+ import time
+ import requests
+ import pandas as pd
+ import numpy as np
+ import xml.etree.ElementTree as ET
+ from urllib.request import urlretrieve
+ from tqdm import tqdm
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import faiss
+
20
+
21
+ token = os.getenv("githubToken")
22
+
23
+ def noop_logger(*args, **kwargs):
24
+ pass
25
+
26
+ def download_pdf(paper, max_retries=3):
27
+ if pd.isna(paper.pdf_url):
28
+ paper.log("ERROR", "Missing PDF URL")
29
+ return paper
30
+
31
+ pdf_path = paper.pdf_path
32
+ if (os.path.exists(pdf_path)):
33
+ return paper
34
+
35
+ headers = {'User-Agent': 'Mozilla/5.0'}
36
+
37
+ for attempt in range(max_retries):
38
+ try:
39
+ response = requests.get(paper.pdf_url, headers=headers)
40
+ if response.status_code == 200:
41
+ with open(pdf_path, "wb") as f:
42
+ f.write(response.content)
43
+
44
+ time.sleep(random.uniform(1.0, 3.0))
45
+ return paper
46
+ elif response.status_code == 429:
47
+ wait = 2 ** attempt
48
+ paper.log("WARNING", f"Rate limited, retrying in {wait}s...")
49
+ time.sleep(wait)
50
+ else:
51
+ paper.log("ERROR", f"Download failed: HTTP {response.status_code}")
52
+ break
53
+ except Exception as e:
54
+ paper.log("ERROR", f"Download error: {e}")
55
+ time.sleep(1)
56
+
57
+ return paper
58
+
59
+ def get_api_link(url):
60
+ username, repo_name = decompose_url(url)
61
+ if (username == None):
62
+ return ""
63
+ return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
64
+
65
+ def decompose_url(url):
66
+ try:
67
+ url = url.split("github.com")[1]
68
+ url = url.strip(".")
69
+ url = url.split(".git")[0]
70
+ url = url.strip("/")
71
+ parts = url.split("/")
72
+ username = parts[0]
73
+ repo_name = parts[1]
74
+ return username, repo_name
75
+ except:
76
+ return None, None
77
+
78
+ def fetch_repo(verbose, repo_url, repo_name, token, force_download=False):
79
+ if (os.path.exists(repo_name)):
80
+ if (force_download):
81
+ os.remove(repo_name)
82
+ else:
83
+ return
84
+
85
+ if ("github.com" not in repo_url):
86
+ return ValueError(f"URL not for github repo, please evaluate manually ({repo_url}).")
87
+
88
+ headers = {"Authorization": f"token {token}"}
89
+ api_url = get_api_link(repo_url)
90
+
91
+ if (api_url == ""):
92
+ return ValueError(f"Failed to parse the URL, please evaluate manually ({repo_url}).")
93
+
94
+ # Sending GET request to GitHub API
95
+ response = requests.get(api_url, headers=headers)
96
+
97
+ if response.status_code == 200:
98
+ with open(repo_name, 'wb') as file:
99
+ file.write(response.content)
100
+ if (response.status_code == 404):
101
+ return ValueError("Repository private / Link broken.")
102
+
103
+ def download_repo(paper):
104
+ try:
105
+ if (paper.main_repo_url is None):
106
+ return
107
+
108
+ fetch_repo(0, paper.main_repo_url, paper.zip_path, token)
109
+ except Exception as e:
110
+ paper.log("ERROR", f"Repo download failed: {e}")
111
+ return paper
112
+
113
+
114
+ def pdf_to_grobid(filename, save_path=None, grobid_url="https://attilasimko-grobid.hf.space/"):
115
+ """
116
+ Convert a PDF to Grobid XML.
117
+
118
+ Parameters:
119
+ filename (str or list): Path to the PDF file or list of PDF files.
120
+ save_path (str, optional): Directory or file path to save to. Defaults to the current directory.
121
+ grobid_url (str, optional): URL of the Grobid server. Defaults to public server.
122
+
123
+ Returns:
124
+ str or list: Path(s) to the saved XML file(s) or parsed XML object if saved to a temp file.
125
+ """
126
+
127
+ def is_server_up(url):
128
+ try:
129
+ response = requests.get(url + "/api/health", timeout=5)
130
+ return response.status_code == 200
131
+ except requests.RequestException:
132
+ return False
133
+
134
+ if not is_server_up(grobid_url):
135
+ raise ConnectionError(f"The Grobid server {grobid_url} is not available.")
136
+
137
+ # Handle multiple files
138
+ if isinstance(filename, list):
139
+ if save_path is None or not os.path.isdir(save_path):
140
+ print(f"Warning: {save_path} is not a directory. PDFs will be saved in the current directory: {os.getcwd()}")
141
+ save_path = "."
142
+
143
+ xmls = []
144
+ for pdf in tqdm(filename, desc="Processing PDFs"):
145
+ try:
146
+ xml = pdf_to_grobid(pdf, save_path, grobid_url)
147
+ xmls.append(xml)
148
+ except Exception as e:
149
+ print(f"Error processing {pdf}: {e}")
150
+ xmls.append(None)
151
+
152
+ return xmls
153
+
154
+ # Handle directory input
155
+ if os.path.isdir(filename):
156
+ pdfs = [os.path.join(filename, f) for f in os.listdir(filename) if f.endswith(".pdf")]
157
+ if not pdfs:
158
+ print(f"Warning: No PDF files found in directory {filename}")
159
+ return pdf_to_grobid(pdfs, save_path, grobid_url)
160
+
161
+ # Ensure file exists
162
+ if not os.path.isfile(filename):
163
+ raise FileNotFoundError(f"The file {filename} does not exist.")
164
+
165
+ # Send PDF to Grobid
166
+ with open(filename, "rb") as file:
167
+ files = {"input": file}
168
+ post_url = f"{grobid_url}/api/processFulltextDocument"
169
+ response = requests.post(post_url, files=files)
170
+
171
+ if response.status_code != 200:
172
+ raise Exception(f"Error: {response.reason}")
173
+
174
+ # Determine save path
175
+ if save_path is None:
176
+ save_file = os.path.join(os.getcwd(), "temp_grobid.xml")
177
+ elif os.path.isdir(save_path):
178
+ base_name = os.path.splitext(os.path.basename(filename))[0] + ".xml"
179
+ save_file = os.path.join(save_path, base_name)
180
+ else:
181
+ save_file = save_path if save_path.endswith(".xml") else save_path + ".xml"
182
+
183
+ # Save the response
184
+ with open(save_file, "wb") as f:
185
+ f.write(response.content)
186
+
187
+ # Return XML object if saved to temp file
188
+ if save_path is None:
189
+ return ET.parse(save_file).getroot()
190
+ else:
191
+ return save_file
192
+
193
+ def extract_body(xml_root):
194
+ """Extracts and returns the text content of the paper's body from Grobid XML."""
195
+ namespace = {"tei": "http://www.tei-c.org/ns/1.0"} # Define TEI namespace
196
+ body_text = []
197
+
198
+ # Locate <body> in the XML structure
199
+ body = xml_root.find(".//tei:body", namespace)
200
+ if body is not None:
201
+ for p in body.findall(".//tei:p", namespace): # Get all paragraphs inside <body>
202
+ if p.text:
203
+ body_text.append(p.text.strip())
204
+
205
+ return "\n".join(body_text)
core/paper.py ADDED
@@ -0,0 +1,160 @@
1
+ # core/paper.py
2
+
3
+ import os
4
+ import uuid
5
+ import re
6
+ import fitz # PyMuPDF
7
+ import pdfplumber
8
+ import hashlib
9
+ import json
10
+ import pandas as pd
11
+ from config.constants import LogType
12
+ import ast
13
+ import streamlit as st
14
+ import datetime
15
+ from enum import Enum
16
+
17
+ def _parse_url_field(value):
18
+ if isinstance(value, list):
19
+ return value
20
+ if isinstance(value, str):
21
+ try:
22
+ parsed = ast.literal_eval(value)
23
+ return parsed if isinstance(parsed, list) else [parsed]
24
+ except Exception:
25
+ return [value]
26
+ if pd.isna(value):
27
+ return []
28
+ return [value]
29
+
30
+ class Paper:
31
+ def __init__(self, title="", venue="", year="", paper_id=None, pdf_url=None, urls_manual=None, urls_auto=None, code_repro_manual=None, code_repro_auto=None, logs=None, verbose=0):
32
+ # Metadata
33
+ self.title = title
34
+ self.venue = venue
35
+ self.year = year
36
+ self.pdf_url = pdf_url
37
+
38
+
39
+ # Optional ground truth links (e.g., from curated metadata)
40
+
41
+ self.urls_manual = _parse_url_field(urls_manual)
42
+ self.urls_auto = _parse_url_field(urls_auto)
43
+
44
+ self.paper_id = self._compute_id() if pd.isna(paper_id) else paper_id
45
+ self.pdf_path = None if (pd.isna(pdf_url)) else "data/papers/" + self.paper_id + ".pdf"
46
+ self.xml_path = None if (pd.isna(pdf_url)) else "data/xml/" + self.paper_id + ".xml"
47
+ self.zip_path = None if (pd.isna(self.main_repo_url)) else "data/test/" + self.paper_id + ".zip"
48
+
49
+ # Internal state
50
+ self.logs = logs if logs is not None else []
51
+ self.code_repro_manual = dict() if pd.isna(code_repro_manual) else code_repro_manual
52
+ self.code_repro_auto = dict() if pd.isna(code_repro_auto) else code_repro_auto
53
+ self.verbose = verbose
54
+
55
+ def __repr__(self):
56
+ return f"<Paper: {self.title}>"
57
+
58
+ @classmethod
59
+ def from_url(cls, code_url, verbose):
60
+ # Supports both dicts and pandas Series
61
+ return cls(
62
+ urls_manual=code_url,
63
+ verbose=verbose
64
+
65
+ )
66
+
67
+ @classmethod
68
+ def from_raw(cls, row):
69
+ # Supports both dicts and pandas Series
70
+ return cls(
71
+ title=row.get("Title", ""),
72
+ venue=row.get("Venue", ""),
73
+ year=row.get("Year", ""),
74
+ pdf_url=row.get('PDF'),
75
+ urls_manual=row.get("Repository"),
76
+ code_repro_manual={"public": row.get("Data Public"), "dependencies": row.get("Dependencies"), "training": row.get("Training code"), "evaluation": row.get("Evaluation code"), "weights": row.get("Pre-trained models"), "readme": row.get("README file"), "license": row.get("Licensing")}
77
+ )
78
+
79
+ @classmethod
80
+ def from_row(cls, row):
81
+ # Supports both dicts and pandas Series
82
+ return cls(
83
+ title=row.get("title", ""),
84
+ venue=row.get("venue", ""),
85
+ year=row.get("year", ""),
86
+ paper_id=row.get('paper_id'),
87
+ pdf_url=row.get('pdf_url'),
88
+ urls_manual=json.loads(row.get("urls_manual")),
89
+ urls_auto=json.loads(row.get("urls_auto")),
90
+ code_repro_manual=json.loads(row.get("code_reproducibility_manual")),
91
+ code_repro_auto=json.loads(row.get("code_reproducibility_auto")),
92
+ logs=json.loads(row.get("logs", "[]"))
93
+ )
94
+
95
+ @property
96
+ def main_repo_url(self):
97
+ urls = [*self.urls_manual, *self.urls_auto]
98
+ github_links = [u for u in urls if "github.com" in u]
99
+ return github_links[0] if github_links else None
100
+
101
+ def _compute_id(self):
102
+ paper_name = self.title
103
+ if (not(pd.isna(self.pdf_url))):
104
+ paper_name += self.pdf_url
105
+
106
+ h = hashlib.sha256()
107
+ h.update(paper_name.encode("utf-8"))
108
+ return h.hexdigest()[:16]
109
+
110
+ def log(self, level, message):
111
+ self.logs.append({
112
+ "timestamp": datetime.datetime.utcnow().isoformat(),
113
+ "level": LogType[level.upper()], # "ERROR", "WARNING", "NOTE", etc.
114
+ "message": message
115
+ })
116
+
117
+ if (self.verbose == 0):
118
+ return
119
+
120
+ show_tips = (self.verbose == 2) | (self.verbose == 4)
121
+ if ((self.verbose == 1) | (self.verbose == 2)):
122
+ show = print
123
+ if ((self.verbose == 3) | (self.verbose == 4)):
124
+ show = st.write
125
+
126
+ # Align line-break
+ if (message.startswith("\n")):
+ show("\n")
+ message = message.lstrip('\n')
+
+ # Only show tips in verbose mode 2 and 4
+ if ((level == "TITLE") & show_tips):
+ show(f"\n#### {message}")
+ if ((level == "TIP") & show_tips):
+ show(f"*{message}*")
+ if ((level == "LOG") & show_tips):
+ show(f"{message}")
+ if ((level == "ERROR")):
+ show(f"**{message}**")
140
+
141
+ if ((level != "TIP") & (level != "LOG") & (level != "ERROR") & (level != "TITLE")):
142
+ raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")
143
+
144
+ def to_dict(self):
145
+ return {
146
+ "title": self.title,
147
+ "venue": self.venue,
148
+ "year": self.year,
149
+ "pdf_url": self.pdf_url,
150
+ "paper_id": self.paper_id,
151
+ "urls_auto": json.dumps(self.urls_auto),
152
+ "urls_manual": json.dumps(self.urls_manual),
153
+ "logs": json.dumps([
154
+ {"type": log["level"].value if isinstance(log["level"], Enum) else log["level"], "message": log["message"]}
155
+ for log in self.logs
156
+ ]),
157
+ "code_reproducibility_manual": json.dumps(self.code_repro_manual),
158
+ "code_reproducibility_auto": json.dumps(self.code_repro_auto),
159
+
160
+ }
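A small sketch of the Paper lifecycle as used in this commit, starting from only a GitHub URL (as the Streamlit app does); the URL is a placeholder:

```python
import pandas as pd
from core.paper import Paper

paper = Paper.from_url("https://github.com/someuser/somerepo", verbose=1)

print(paper.main_repo_url)   # first github.com link among the manual/auto URLs
print(paper.paper_id)        # 16-character sha256 prefix derived from title + pdf_url
print(paper.zip_path)        # data/test/<paper_id>.zip

paper.log("ERROR", "Something went wrong")   # appended to paper.logs and printed (verbose=1)

# to_dict() json-encodes the list/dict fields so the row can go straight into a DataFrame.
df = pd.DataFrame([paper.to_dict()])
```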
data/dump.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/fetch_miccai.py DELETED
@@ -1,60 +0,0 @@
1
- import pandas as pd
2
- import requests
3
- import re
4
- from multiprocessing import Pool, cpu_count
5
- from functools import partial
6
-
7
- # Function to process each URL
8
- def process_paper(year, url):
9
- try:
10
- paper_page = requests.get(url).text
11
-
12
- # Find title
13
- title_pattern = r'<title>(.*?)\s*</title>'
14
- title_match = re.search(title_pattern, paper_page, re.DOTALL)
15
- title = title_match.group(1)
16
-
17
- # Find the code repository link
18
- code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
19
- code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
20
- code_repo_link = code_repo_match.group(1) if code_repo_match else ""
21
-
22
- # Find the dataset information
23
- dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
24
- dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
25
- dataset_info = "Yes" if dataset_match else "No"
26
-
27
- # Return a dictionary of the results
28
- return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
29
-
30
- except Exception as e:
31
- print(f"Error processing {url}: {e}")
32
- return None
33
-
34
- current_year = 2024 # Update with the current year
35
- MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org/2022/papers/", "https://conferences.miccai.org/2023/papers/"]
36
- MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org", "https://conferences.miccai.org"]
37
- years = [2021, 2022, 2023]
38
- # Set debug mode
39
- debug = False
40
-
41
- # Fetch all URLs for each year
42
- all_year_urls = []
43
- for i in range(len(MICCAI_pages)):
44
- year_page = requests.get(MICCAI_pages[i]).text
45
- print(year_page)
46
- urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0] for line in year_page.split('\n') if "&bullet" in line]
47
- all_year_urls.extend([(years[i], url) for url in urls])
48
-
49
- print(all_year_urls)
50
- # Parallel processing using Pool
51
- # if __name__ == "__main__":
52
- # with Pool(processes=12) as pool: # Use 12 processes
53
- # results = pool.starmap(process_paper, all_year_urls)
54
-
55
- # # Filter out any None results due to errors
56
- # results = [result for result in results if result is not None]
57
-
58
- # miccai = pd.DataFrame(results)
59
- # # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
60
- # miccai.to_csv('miccai.csv')
data/fetch_processed.py DELETED
@@ -1,31 +0,0 @@
1
- import csv
2
- import numpy as np
3
- import pandas as pd
4
- import re
5
-
6
- current_year = 2024
7
- MIDL_years = range(2018, current_year + 1, 1)
8
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
9
-
10
- for venue in custom_order:
11
- df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
12
- df = df.replace('\t', ' ', regex=True)
13
- df.to_csv(f'data/{venue}.csv', sep="\t")
14
-
15
- # Store all evaluations here
16
- paper_dump = pd.DataFrame()
17
- # Official color codes for conferences
18
- MIDL_colors = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000"]
19
-
20
- for venue in custom_order:
21
- with open(f'data/{venue}.csv') as file:
22
- tsv_file = csv.reader(file, delimiter="\t")
23
- for row in tsv_file:
24
- if (row[0] == ""):
25
- continue
26
-
27
- if (row[1] == ""):
28
- continue
29
-
30
- paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]]})], ignore_index=True)
31
- paper_dump.to_csv(f'data/dump.csv', sep="\t")
data/fetch_zips.py DELETED
@@ -1,43 +0,0 @@
1
- import csv
2
- import numpy as np
3
- import sys
4
- import pandas as pd
5
- import re
6
- sys.path.append("./")
7
- from evaluations.utils import *
8
-
9
- token = os.getenv("githubToken")
10
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
11
-
12
- for venue in custom_order:
13
- df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
14
- df = df.replace('\t', ' ', regex=True)
15
- df.to_csv(f'data/{venue}.csv', sep="\t")
16
-
17
- # Store all evaluations here
18
- paper_dump = pd.DataFrame()
19
- # Official color codes for conferences
20
- zip_idx = 0
21
-
22
- for venue in custom_order:
23
- with open(f'data/{venue}.csv') as file:
24
- tsv_file = csv.reader(file, delimiter="\t")
25
- for row in tsv_file:
26
- if (row[0] == ""):
27
- continue
28
-
29
- if (row[1] == ""):
30
- continue
31
-
32
- repo_url = row[4]
33
- username, repo_name = decompose_url(repo_url)
34
- repo_save_name = f"repo_{zip_idx}.zip"
35
- repository_zip_name = f"data/test/{repo_save_name}"
36
- log(0, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
37
- fetch_repo(0, repo_url, repository_zip_name, token)
38
-
39
- if (os.path.exists(repository_zip_name)):
40
- paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]], "zip_idx": [ repository_zip_name ]})], ignore_index=True)
41
- zip_idx += 1
42
-
43
- paper_dump.to_csv(f'data/zipfiles.csv', sep="\t")
data/zipfiles.csv DELETED
The diff for this file is too large to render. See raw diff
 
data_generation/fetch_processed.py ADDED
@@ -0,0 +1,123 @@
+ import sys
+ import os
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ sys.path.append(ROOT_DIR)
+
+ import csv
+ import re
+ import numpy as np
+ import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+ from tqdm import tqdm
+
+ from evaluations.utils import *
+ from evaluations.url import fetch_url
+ from core.paper import Paper
+ from core.conversion import download_repo, download_pdf, pdf_to_grobid
+ from config.constants import VENUE_ORDER
+
64
+ def get_urls_and_assign(paper):
65
+ try:
66
+ urls = fetch_url(paper.pdf_path)
67
+ paper.urls_auto = urls # Just update this
68
+ except Exception as e:
69
+ paper.log("ERROR", str(e))
70
+ return paper
71
+
72
+ def download_xml(paper):
73
+ try:
74
+ if (paper.pdf_path is None):
75
+ return paper
76
+
77
+ if (os.path.exists(paper.xml_path)):
78
+ paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
79
+ return paper
80
+
81
+ pdf_to_grobid(paper.pdf_path, paper.xml_path)
82
+
83
+ return paper
84
+ except Exception as e:
85
+ paper.log("ERROR", f"Repo download failed: {e}")
86
+ return paper
87
+
88
+
89
+ max_workers = 6
90
+ if __name__ == "__main__":
91
+ for venue in VENUE_ORDER:
92
+ df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
93
+ df = df.replace('\t', ' ', regex=True)
94
+ df = df.replace('[]', '')
95
+ df.to_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
96
+
97
+ papers = []
98
+ for venue in VENUE_ORDER:
99
+ paper_list = pd.read_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
100
+ paper_list["Venue"] = venue
101
+ for _, row in paper_list.iterrows():
102
+ if (row.iloc[0] == ""):
103
+ continue
104
+
105
+ if (row.iloc[1] == ""):
106
+ continue
107
+ papers.append(Paper.from_raw(row))
108
+
109
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
110
+ papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))
111
+
112
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
113
+ papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))
114
+
115
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
116
+ papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))
117
+
118
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
119
+ papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))
120
+
121
+ results = [p.to_dict() for p in papers]
122
+ results_df = pd.DataFrame(results)
123
+ results_df.to_csv("data/papers.csv", sep="\t", index=False)
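The script writes one row per paper to data/papers.csv via Paper.to_dict(); Paper.from_row() is the intended inverse. A hedged sketch of loading that dump back into Paper objects:

```python
import pandas as pd
from core.paper import Paper

# Assumes data/papers.csv was just produced by this script (tab-separated, json-encoded columns).
df = pd.read_csv("data/papers.csv", sep="\t")

papers = [Paper.from_row(row) for _, row in df.iterrows()]
print(len(papers), "papers loaded;",
      sum(p.main_repo_url is not None for p in papers), "with a GitHub link")
```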
{data → data_generation/paper_scraping}/fetch_arxiv.py RENAMED
@@ -1,10 +1,23 @@
1
  import pandas as pd
2
  import requests
3
  import pdfplumber
 
4
  import re
5
  from multiprocessing import Pool, cpu_count
6
  from functools import partial
 
7
  import os
 
 
 
 
 
 
 
 
 
 
 
8
  # Function to process each URL
9
  def process_arxiv_paper(article_link):
10
  try:
@@ -16,69 +29,70 @@ def process_arxiv_paper(article_link):
16
 
17
  article_id = article_link.split("/")[-1]
18
  pdf_url = f'https://arxiv.org/pdf/{article_id}'
19
- response = requests.get(pdf_url)
20
- if response.status_code == 200:
21
- with open(f"{article_id}.pdf", 'wb') as file:
22
- file.write(response.content)
23
- if (response.status_code == 404):
24
- print("Failed to fetch pdf")
25
- return None
26
-
27
- urls = []
28
- link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
29
- with pdfplumber.open(f"{article_id}.pdf") as pdf:
30
- # Loop through all pages
31
- for page_num, page in enumerate(pdf.pages):
32
- # Extract text from the page
33
- text = page.extract_text()
34
-
35
- # Search for a specific word or phrase
36
- found_urls = re.findall(link_pattern, text)
37
- urls.extend(found_urls)
38
- os.remove(f"{article_id}.pdf")
39
- urls = [url for url in urls if ("pytorch" not in url) & ("fchollet" not in url) & (len(url.split("github.com")[1].split("/")) >= 3)]
40
- print(urls)
41
- url = urls[0] if len(urls) > 0 else ""
42
-
43
- # Return a dictionary of the results
44
- return {"venue": "arXiv", "title": title, "url": url, "year": year}
45
 
46
  except Exception as e:
47
  print(f"Error processing {article_link}: {e}")
48
  return None
49
 
50
- # Set debug mode
51
- debug = False
52
- # Fetch all URLs for each year
53
- all_year_urls = []
54
 
55
- page_size = 50
56
- search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
57
  articles = []
58
- for search_query in search_queries:
59
- page = 0
60
- while (page <= 100):
61
- start_idx = page_size * page
62
- url = f"{search_query}{start_idx}"
63
- current_page = requests.get(url).text
64
- pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
65
- matches = re.findall(pattern, current_page)
66
- if (len(matches) == 0):
67
- break
68
- else:
69
- page += 1
70
 
71
- articles += matches
72
- articles = np.unique(articles)
73
 
74
  # Parallel processing using Pool
75
  if __name__ == "__main__":
76
- with Pool(processes=4) as pool:
77
- results = pool.starmap(process_arxiv_paper, [[article] for article in articles])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # Filter out any None results due to errors
80
  results = [result for result in results if result is not None]
81
 
82
  # Convert the list of dictionaries to a DataFrame
83
  arxiv = pd.DataFrame(results)
84
- arxiv.to_csv('arxiv.csv')
 
1
  import pandas as pd
2
  import requests
3
  import pdfplumber
4
+ import numpy as np
5
  import re
6
  from multiprocessing import Pool, cpu_count
7
  from functools import partial
8
+ import urllib, urllib.request
9
  import os
10
+ import sys
11
+ from tqdm import tqdm
12
+ from tqdm.contrib.concurrent import process_map # better for multiprocessing
13
+ import feedparser
14
+ import time
15
+ from datetime import datetime
16
+ from tqdm import tqdm
17
+
18
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
19
+ sys.path.append(ROOT_DIR)
20
+
21
  # Function to process each URL
22
  def process_arxiv_paper(article_link):
23
  try:
 
29
 
30
  article_id = article_link.split("/")[-1]
31
  pdf_url = f'https://arxiv.org/pdf/{article_id}'
32
+
33
+ urls = [] # fetch_url(pdf_url)
34
+
35
+ return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  except Exception as e:
38
  print(f"Error processing {article_link}: {e}")
39
  return None
40
 
41
+ page_size = 100
42
+ base_query = "http://export.arxiv.org/api/query"
43
+ query_params = "search_query=all:(deep+learning)+AND+all:cancer&max_results=100"
 
44
 
 
 
45
  articles = []
46
+ start = 0
47
+ max_empty_pages = 3 # stop early if we hit consecutive empty pages
48
+ empty_pages = 0
 
 
 
 
 
 
 
 
 
49
 
 
 
50
 
51
  # Parallel processing using Pool
52
  if __name__ == "__main__":
53
+ print("Fetching arXiv article URLs...")
54
+
55
+ while True:
56
+ # Build URL with pagination
57
+ url = f"{base_query}?{query_params}&start={start}"
58
+
59
+ # Parse the Atom feed
60
+ feed = feedparser.parse(url)
61
+ entries = feed.entries
62
+
63
+ if not entries:
64
+ empty_pages += 1
65
+ print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
66
+ if empty_pages >= max_empty_pages:
67
+ print("Stopping early due to repeated empty results.")
68
+ break
69
+ time.sleep(4)
70
+ start += page_size
71
+ continue
72
+
73
+ empty_pages = 0 # reset empty count on success
74
+
75
+ for entry in entries:
76
+ pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
77
+ if pub_date >= datetime(2018, 1, 1):
78
+ articles.append(entry.link)
79
+
80
+ # Log progress
81
+ print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")
82
+
83
+ # Stop if fewer than full page — probably the last one
84
+ if len(entries) < page_size:
85
+ print("Reached last page of results.")
86
+ break
87
+
88
+ start += page_size
89
+ time.sleep(4) # Respect rate limit
90
+ articles = np.unique(articles)
91
+ results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")
92
 
93
  # Filter out any None results due to errors
94
  results = [result for result in results if result is not None]
95
 
96
  # Convert the list of dictionaries to a DataFrame
97
  arxiv = pd.DataFrame(results)
98
+ arxiv.to_csv('data/raw/arxiv.csv')
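In this revision the per-paper link extraction is stubbed out (`urls = []  # fetch_url(pdf_url)`); the previous version of this script did it inline with pdfplumber. A minimal sketch of what the fetch_url helper from evaluations.url could look like, reconstructed from that older inline code rather than from a confirmed API:

```python
import os
import re
import requests
import pdfplumber

def fetch_url(pdf_url, tmp_path="tmp_article.pdf"):
    """Download a PDF and return the GitHub repository links found in its text (sketch)."""
    response = requests.get(pdf_url)
    if response.status_code != 200:
        return []
    with open(tmp_path, "wb") as f:
        f.write(response.content)

    link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
    urls = []
    with pdfplumber.open(tmp_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            urls.extend(re.findall(link_pattern, text))
    os.remove(tmp_path)

    # Keep only links that point at a specific repository, as the old script did.
    return [u for u in urls
            if "pytorch" not in u and "fchollet" not in u
            and len(u.split("github.com")[1].split("/")) >= 3]
```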
data_generation/paper_scraping/fetch_miccai.py ADDED
@@ -0,0 +1,89 @@
1
+ import pandas as pd
2
+ import requests
3
+ import re
4
+ from tqdm import tqdm
5
+ from tqdm.contrib.concurrent import process_map
6
+ from multiprocessing import cpu_count
7
+
8
+ # --- Parse a single paper page ---
9
+ def process_paper(year, url):
10
+ try:
11
+ paper_page = requests.get(url).text
12
+
13
+ # Title
14
+ title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
15
+ title = title_match.group(1).strip() if title_match else ""
16
+
17
+ # Code repo link
18
+ code_repo_match = re.search(r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL)
19
+ code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""
20
+
21
+ # Dataset info
22
+ dataset_match = re.search(r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL)
23
+ dataset_info = "Yes" if dataset_match else "No"
24
+
25
+ return {
26
+ "title": title,
27
+ "year": year,
28
+ "url": code_repo_link,
29
+ "public": dataset_info
30
+ }
31
+
32
+ except Exception as e:
33
+ print(f"Error processing {url}: {e}")
34
+ return None
35
+
36
+ # --- Main Execution ---
37
+ if __name__ == "__main__":
38
+ MICCAI_pages = [
39
+ "https://miccai2021.org/openaccess/paperlinks/",
40
+ "https://conferences.miccai.org/2022/papers/",
41
+ "https://conferences.miccai.org/2023/papers/",
42
+ "https://papers.miccai.org/miccai-2024/"
43
+ ]
44
+ MICCAI_root = [
45
+ "https://miccai2021.org/openaccess/paperlinks/",
46
+ "https://conferences.miccai.org",
47
+ "https://conferences.miccai.org",
48
+ "https://papers.miccai.org"
49
+ ]
50
+ years = [2021, 2022, 2023, 2024]
51
+
52
+ all_year_urls = []
53
+
54
+ print("🔍 Fetching paper URLs by year...")
55
+ for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
56
+ try:
57
+ response = requests.get(MICCAI_pages[i])
58
+ year_page = response.text
59
+ if years[i] == 2024:
60
+ matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
61
+ urls = [MICCAI_root[i] + match for match in matches]
62
+ else:
63
+ urls = [
64
+ MICCAI_root[i] + line.split('href="')[1].split('"')[0]
65
+ for line in year_page.split('\n')
66
+ if "&bullet" in line and 'href="' in line
67
+ ]
68
+ all_year_urls.extend([(years[i], url) for url in urls])
69
+ except Exception as e:
70
+ print(f"Failed to fetch year {years[i]}: {e}")
71
+
72
+ print(f"📄 Total papers found: {len(all_year_urls)}")
73
+
74
+ # --- Parallel scrape each paper page ---
75
+ print("⚙️ Processing paper metadata...")
76
+ results = process_map(
77
+ process_paper,
78
+ [item[0] for item in all_year_urls],
79
+ [item[1] for item in all_year_urls],
80
+ max_workers=12,
81
+ chunksize=1,
82
+ desc="Parsing Papers"
83
+ )
84
+
85
+ results = [r for r in results if r is not None]
86
+
87
+ miccai = pd.DataFrame(results)
88
+ miccai.to_csv('data/raw/miccai.csv', index=False)
89
+ print("✅ Saved to data/miccai.csv")
{data → data_generation/paper_scraping}/fetch_nature.py RENAMED
@@ -1,19 +1,23 @@
1
  import pandas as pd
2
  import requests
 
3
  import re
4
  from multiprocessing import Pool, cpu_count
5
- from functools import partial
 
 
 
6
 
7
  # Function to process each URL
8
  def process_nature_paper(article_link):
9
  try:
10
- url = f'https://www.nature.com/articles/{article_link}'
11
- article_text = requests.get(url).text
12
 
13
  pattern = r'Code availability.*?<a href="([^"]+)"'
14
  matches = re.findall(pattern, article_text, re.DOTALL)
15
  urls = [link for link in matches if "github" in link]
16
- url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
17
 
18
  year = re.findall(r'datetime="(\d{4})', article_text)[0]
19
  # # Find title
@@ -25,10 +29,10 @@ def process_nature_paper(article_link):
25
  dataset_info = "Yes" if (len(matches) > 0) else "No"
26
 
27
  # # Return a dictionary of the results
28
- return {"title": title, "url": url, "year": year, "public": dataset_info, "pdf": ""}
29
 
30
  except Exception as e:
31
- print(f"Error processing {url}: {e}")
32
  return None
33
 
34
  # Set debug mode
@@ -36,33 +40,29 @@ debug = False
36
 
37
  # Fetch all URLs for each year
38
  all_year_urls = []
39
- search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]
40
  articles = []
41
- for search_query in search_queries:
 
42
  page = 1
43
- while (page <= 100):
44
  url = f"{search_query}{page}"
45
  current_page = requests.get(url).text
46
  pattern = r'href="/articles/([^"]+)"'
47
  matches = re.findall(pattern, current_page)
48
- if (len(matches) == 0):
49
  break
50
  else:
51
  page += 1
52
-
53
- articles += matches
54
  articles = np.unique(articles)
55
 
56
 
57
  # Parallel processing using Pool
58
  if __name__ == "__main__":
59
- with Pool(processes=12) as pool:
60
- results = pool.starmap(process_nature_paper, [[article] for article in articles])
61
-
62
- # Filter out any None results due to errors
63
  results = [result for result in results if result is not None]
64
 
65
- # Convert the list of dictionaries to a DataFrame
66
  nature = pd.DataFrame(results)
67
  nature = nature[['title', 'year', 'pdf', 'url', 'public']]
68
- nature.to_csv('nature.csv')
 
1
  import pandas as pd
2
  import requests
3
+ import os
4
  import re
5
  from multiprocessing import Pool, cpu_count
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from tqdm.contrib.concurrent import process_map # better for multiprocessing
9
+
10
 
11
  # Function to process each URL
12
  def process_nature_paper(article_link):
13
  try:
14
+ pdf_url = f'https://www.nature.com/articles/{article_link}'
15
+ article_text = requests.get(pdf_url).text
16
 
17
  pattern = r'Code availability.*?<a href="([^"]+)"'
18
  matches = re.findall(pattern, article_text, re.DOTALL)
19
  urls = [link for link in matches if "github" in link]
20
+ # url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
21
 
22
  year = re.findall(r'datetime="(\d{4})', article_text)[0]
23
  # # Find title
 
29
  dataset_info = "Yes" if (len(matches) > 0) else "No"
30
 
31
  # # Return a dictionary of the results
32
+ return {"title": title, "year": year, "pdf": pdf_url + ".pdf", "url": urls, "public": dataset_info}
33
 
34
  except Exception as e:
35
+ print(f"Error processing {pdf_url}: {e}")
36
  return None
37
 
38
  # Set debug mode
 
40
 
41
  # Fetch all URLs for each year
42
  all_year_urls = []
43
+ search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page="]
44
  articles = []
45
+
46
+ for search_query in tqdm(search_queries, desc="Search Queries"):
47
  page = 1
48
+ while page <= 100:
49
  url = f"{search_query}{page}"
50
  current_page = requests.get(url).text
51
  pattern = r'href="/articles/([^"]+)"'
52
  matches = re.findall(pattern, current_page)
53
+ if not matches:
54
  break
55
  else:
56
  page += 1
57
+ articles += matches
 
58
  articles = np.unique(articles)
59
 
60
 
61
  # Parallel processing using Pool
62
  if __name__ == "__main__":
63
+ results = process_map(process_nature_paper, articles, max_workers=12, chunksize=1, desc="Processing Articles")
 
 
 
64
  results = [result for result in results if result is not None]
65
 
 
66
  nature = pd.DataFrame(results)
67
  nature = nature[['title', 'year', 'pdf', 'url', 'public']]
68
+ nature.to_csv('data/raw/nature.csv')
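The Code availability link is pulled from the raw article HTML with a DOTALL regex. A standalone check of that pattern against a fabricated snippet, for illustration only:

```python
import re

html = """
<h2>Code availability</h2>
<p>The code is available at <a href="https://github.com/someuser/somerepo">GitHub</a>.</p>
"""

pattern = r'Code availability.*?<a href="([^"]+)"'
matches = re.findall(pattern, html, re.DOTALL)
github_links = [link for link in matches if "github" in link]
print(github_links)   # ['https://github.com/someuser/somerepo']
```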
evaluations/documentation.py CHANGED
@@ -1,14 +1,15 @@
1
- from .utils import log,fetch_code
2
  import re
3
  import numpy as np
 
4
 
5
- def is_applicable(verbose, llm, readme):
6
  res_training = "NA"
7
  res_evaluation = "NA"
8
  res_weights = "NA"
9
 
10
  if (llm):
11
- log(verbose, "TITLE", "\nChecking what parts of the evaluations are applicable...")
12
  res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
13
  res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
14
  res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
@@ -16,31 +17,28 @@ def is_applicable(verbose, llm, readme):
16
  applicable = f"{res_training}/{res_evaluation}/{res_weights}"
17
  return applicable
18
 
19
- def evaluate(verbose, llm, zip, readme):
20
- log(verbose, "TITLE", "\nEvaluating code documentation...")
21
  overall = "No"
22
 
23
-
24
  code_to_comment_ratio = get_code_to_comment_ratio(zip)
25
- log(verbose, "LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
26
-
27
-
28
- if (readme):
29
  non_empty_rows = [row for row in readme.split("\n") if row != ""]
30
  if (len(non_empty_rows) < 5):
31
- log(verbose, "ERROR", "Readme file has very few lines")
32
 
33
  if (llm):
34
  code = fetch_code(zip)
35
  if (llm):
36
  summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
37
- log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
38
  return overall
39
 
40
- if (count_code_lines(non_empty_rows) > 5):
41
- log(verbose, "LOG", "Readme file contains python examples.")
42
- overall = "Yes"
43
- return overall
44
 
45
 
46
  if (llm):
@@ -50,26 +48,30 @@ def evaluate(verbose, llm, zip, readme):
50
  and evaluate the proposed model?'
51
  llm.predict("HELP", prompt)
52
 
 
 
 
 
 
 
53
 
54
- manual_fail = False
55
  if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
56
- log(verbose, "ERROR", "Readme file missing training information")
57
- overall = "No"
58
- if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
59
- log(verbose, "ERROR", "Readme file missing testing information")
60
- overall = "No"
61
 
62
- if ((len(re.findall("example", readme, re.IGNORECASE)) > 0)):
63
- log(verbose, "LOG", "Readme file contains links to examples")
64
- overall = "Yes"
 
65
 
66
- if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
67
- (len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
68
- (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
69
- log(verbose, "ERROR", "Readme file missing information about package dependencies")
70
- overall = "No"
71
 
72
- return overall
 
73
 
74
  def count_comment_lines(lines):
75
  # Initialize counters
 
1
+ from .utils import fetch_code
2
  import re
3
  import numpy as np
4
+ from core.conversion import noop_logger
5
 
6
+ def is_applicable(llm, readme, log_fn=noop_logger):
7
  res_training = "NA"
8
  res_evaluation = "NA"
9
  res_weights = "NA"
10
 
11
  if (llm):
12
+ log_fn("TITLE", "\nChecking what parts of the evaluations are applicable...")
13
  res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
14
  res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
15
  res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
 
17
  applicable = f"{res_training}/{res_evaluation}/{res_weights}"
18
  return applicable
19
 
20
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
21
+ log_fn("TITLE", "\nEvaluating code documentation...")
22
  overall = "No"
23
 
 
24
  code_to_comment_ratio = get_code_to_comment_ratio(zip)
25
+ log_fn("LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
26
+ result = { "dependencies": "No", "training": "No", "evaluation": "No", "weights": "No", "scripts": "No" }
27
+ for readme in readmes:
 
28
  non_empty_rows = [row for row in readme.split("\n") if row != ""]
29
  if (len(non_empty_rows) < 5):
30
+ log_fn("ERROR", "Readme file has very few lines")
31
 
32
  if (llm):
33
  code = fetch_code(zip)
34
  if (llm):
35
  summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
36
+ log_fn("LOG", f"Based on the code, your readme file could be something like...\n{summary}")
37
  return overall
38
 
39
+ if (count_code_lines(non_empty_rows) > 2):
40
+ log_fn("LOG", "Readme file contains python examples.")
41
+ result["scripts"] = "Yes"
 
42
 
43
 
44
  if (llm):
 
48
  and evaluate the proposed model?'
49
  llm.predict("HELP", prompt)
50
 
51
+ if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
52
+ (len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
53
+ (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
54
+ log_fn("ERROR", "Readme file missing information about package dependencies")
55
+ else:
56
+ result["dependencies"] = "Yes"
57
 
 
58
  if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
59
+ log_fn("ERROR", "Readme file missing training information")
60
+ else:
61
+ result["training"] = "Yes"
 
 
62
 
63
+ if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
64
+ log_fn("ERROR", "Readme file missing testing information")
65
+ else:
66
+ result["evaluating"] = "Yes"
67
 
68
+ if ((len(re.findall("example", readme, re.IGNORECASE)) == 0)):
69
+ log_fn("LOG", "Readme file contains no links to examples")
70
+ else:
71
+ result["evaluating"] = "Yes"
 
72
 
73
+ score = np.sum(np.array(list(result.values()), dtype=str) == "Yes")
74
+ return "Yes" if score >= 2 else "No"
75
 
76
  def count_comment_lines(lines):
77
  # Initialize counters
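The rewritten evaluate() collects per-README findings in a result dict and answers "Yes" once at least two checks pass. A standalone sketch of just that scoring step:

```python
import numpy as np

# Example outcome of the README checks (keys mirror the result dict in evaluate()).
result = {"dependencies": "Yes", "training": "No", "evaluation": "Yes",
          "weights": "No", "scripts": "No"}

score = np.sum(np.array(list(result.values()), dtype=str) == "Yes")
overall = "Yes" if score >= 2 else "No"
print(score, overall)   # 2 Yes
```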
evaluations/license.py CHANGED
@@ -1,10 +1,10 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nEvaluating repository licensing...")
6
  overall = "No"
7
- license_files = [license for license in zip.namelist() if ((("LICENSE" in license) | ("license" in license)) & (len(license.split("/")) == 2))]
8
  if (len(license_files) > 0):
9
  license = zip.open(license_files[0]).read().decode("utf-8")
10
  ans = [row for row in license.split("\n") if row != ""]
@@ -13,18 +13,18 @@ def evaluate(verbose, llm, zip, readme):
13
  license = license
14
  prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
15
  ans = llm.predict("HELP", prompt)
16
- log(verbose, "LOG", f"Found license: {ans}")
17
  else:
18
- log(verbose, "LOG", f"Found license file: {license_files[0]}")
19
 
20
  overall = "Yes"
21
  return overall
22
 
23
- if (readme):
24
- if ("License" in readme):
25
- log(verbose, "LOG", "License found in README.")
26
- overall = "Yes"
27
- return overall
28
 
29
- log(verbose, "ERROR", "LICENSE file not found.")
30
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readme, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nEvaluating repository licensing...")
6
  overall = "No"
7
+ license_files = [license_path for license_path in zip.namelist() if ((("license" in license_path.lower())) & (len(license_path.split("/")) == 2))]
8
  if (len(license_files) > 0):
9
  license = zip.open(license_files[0]).read().decode("utf-8")
10
  ans = [row for row in license.split("\n") if row != ""]
 
13
  license = license
14
  prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
15
  ans = llm.predict("HELP", prompt)
16
+ log_fn("LOG", f"Found license: {ans}")
17
  else:
18
+ log_fn("LOG", f"Found license file: {license_files[0]}")
19
 
20
  overall = "Yes"
21
  return overall
22
 
23
+ for readme_file in readme:
24
+ if ("license" in readme_file.lower()):
25
+ log_fn("LOG", "License found in README.")
26
+ overall = "Yes"
27
+ return overall
28
 
29
+ log_fn("ERROR", "LICENSE file not found.")
30
  return overall
evaluations/pitfalls.py CHANGED
@@ -1,13 +1,14 @@
1
- from .utils import log, fetch_code
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
6
  codebase = fetch_code(zip)
7
 
8
  if (llm):
9
  for code in codebase:
10
  pitfall_check = llm.predict("STRICT", f"{codebase[code]}Do you find any signs of serious issues in this code?")
11
  if (("Yes" in pitfall_check) & ("No" not in pitfall_check)):
12
- log(verbose, "ERROR", f"Found possible issues in {code}")
13
- log(verbose, "LOG", llm.predict("PITFALL", f"File name {code} file {codebase[code]}\n Can you find any signs of common pitfalls in this code?"))
 
1
+ from .utils import fetch_code
2
  import re
3
+ from core.conversion import noop_logger
4
 
5
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
6
+ log_fn("TITLE", "\nLooking for common pitfalls (in development)...")
7
  codebase = fetch_code(zip)
8
 
9
  if (llm):
10
  for code in codebase:
11
  pitfall_check = llm.predict("STRICT", f"{codebase[code]}Do you find any signs of serious issues in this code?")
12
  if (("Yes" in pitfall_check) & ("No" not in pitfall_check)):
13
+ log_fn("ERROR", f"Found possible issues in {code}")
14
+ log_fn("LOG", llm.predict("PITFALL", f"File name {code} file {codebase[code]}\n Can you find any signs of common pitfalls in this code?"))
evaluations/repo_evaluations.py CHANGED
@@ -2,84 +2,71 @@ import pandas as pd
2
  import os
3
  from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
4
  from evaluations.utils import *
 
 
5
  import zipfile
 
 
6
  import os
7
  import numpy as np
8
  from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
9
 
10
- def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
11
  try:
12
  if (not(llm)):
13
- log(verbose, "LOG", "No LLM will be used for the evaluation.")
14
 
15
- results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
16
 
17
- if ((title != None) & (year != None) & (title != "") & (year != "")):
18
- res = fetch_openalex(verbose, title, year)
19
- if ((res != None)):
20
- res = res["results"]
21
- if (len(res) > 0):
22
- res = res[0]
23
- results["pred_citations"] = res["cited_by_count"]
24
-
25
- if (get_api_link(repo_url) != ""):
26
- results["pred_valid"] = True
27
- else:
28
- return results
29
-
30
- username, repo_name = decompose_url(repo_url)
31
-
32
- # If you don't provide a zip file, it will be fetched from github. For this, you need to provide a github token.
33
- if (zip is None):
34
- token = os.getenv("githubToken")
35
- repository_zip_name = "data/repo.zip"
36
- log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
37
-
38
- fetch_repo(verbose, repo_url, repository_zip_name, token)
39
 
40
- if (not(os.path.exists(repository_zip_name))):
41
- results["pred_live"] = "No"
42
- return results
43
-
44
- results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
45
 
46
- zip = zipfile.ZipFile(repository_zip_name)
 
 
47
 
48
- readme = fetch_readme(zip)
49
- results["NA"] = documentation.is_applicable(verbose, llm, readme)
50
-
51
- results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
52
 
53
  if (len(zip.namelist()) <= 2):
54
- log(verbose, "LOG", "The repository is empty.")
55
 
56
- results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
57
- results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
58
- results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
59
- results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
60
- results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
61
- results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
62
- pitfalls.evaluate(verbose, llm, zip, readme)
63
-
64
- return results
65
  except Exception as e:
66
- log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
67
- results["pred_live"] = "No"
68
- return results
69
-
70
- def full_evaluation():
71
- paper_dump = pd.read_csv("data/zipfiles.csv", sep="\t")
72
- full_results = []
73
-
74
- for idx, row in paper_dump.iterrows():
75
-
76
- if (pd.isna(row["url"]) | (row["url"] == "")):
77
- continue
78
-
79
- print(str(int(100 * idx / paper_dump["title"].count())) + "% done")
80
- result = evaluate(None, False, row["url"], row["title"], row["year"], zip=zipfile.ZipFile(row["zip_idx"]))
81
- for column in result.keys():
82
- row[column] = result[column]
83
-
84
- full_results.append(row)
85
- return pd.DataFrame(full_results)
 
2
  import os
3
  from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
4
  from evaluations.utils import *
5
+ from core.conversion import fetch_repo, decompose_url
6
+
7
  import zipfile
8
+ import csv
9
+
10
  import os
11
  import numpy as np
12
  from huggingface_hub import InferenceClient
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ from core.conversion import noop_logger
15
+
16
+ token = os.getenv("githubToken")
17
+ def evaluate(llm, paper, log_fn=noop_logger):
18
+ repo_url = paper.main_repo_url
19
+ title = paper.title
20
+ year = paper.year
21
+ zip=zipfile.ZipFile(paper.zip_path)
22
 
 
23
  try:
24
  if (not(llm)):
25
+ log_fn("LOG", "No LLM will be used for the evaluation.")
26
 
27
+ paper.code_repro_auto = { "live": "Yes", "dependencies": None, "training": None, "evaluation": None, "weights": None, "readme": None, "license": None, "stars": None, "citations": None, "valid": False}
28
 
29
+ # if ((title != None) & (year != None) & (title != "") & (year != "")):
30
+ # res = fetch_openalex(title, year, log_fn=log_fn)
31
+ # if ((res != None)):
32
+ # res = res["results"]
33
+ # if (len(res) > 0):
34
+ # res = res[0]
35
+ # paper.code_repro_auto["citations"] = res["cited_by_count"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ # if (get_api_link(repo_url) != ""):
38
+ # paper.code_repro_auto["valid"] = True
39
+ # else:
40
+ # return paper.code_repro_auto
 
41
 
42
+ # paper.code_repro_auto["stars"] = fetch_repo_stars(repo_url, token, log_fn)
43
+ readmes = fetch_readmes(zip)
44
+ paper.code_repro_auto["NA"] = documentation.is_applicable(llm, readmes, log_fn)
45
 
46
+ paper.code_repro_auto["license"] = license.evaluate(llm, zip, readmes, log_fn)
 
 
 
47
 
48
  if (len(zip.namelist()) <= 2):
49
+ log_fn("LOG", "The repository is empty.")
50
 
51
+ paper.code_repro_auto["dependencies"] = requirements.evaluate(llm, zip, readmes, log_fn)
52
+ paper.code_repro_auto["training"] = training.evaluate(llm, zip, readmes, log_fn)
53
+ paper.code_repro_auto["evaluation"] = validating.evaluate(llm, zip, readmes, log_fn)
54
+ paper.code_repro_auto["weights"] = weights.evaluate(llm, zip, readmes, log_fn)
55
+ paper.code_repro_auto["readme"] = documentation.evaluate(llm, zip, readmes, log_fn)
56
+ paper.code_repro_auto["codetocomment"] = documentation.get_code_to_comment_ratio(zip)
57
+ pitfalls.evaluate(llm, zip, readmes, log_fn)
58
+
59
+ return paper
60
  except Exception as e:
61
+ log_fn("ERROR", "Evaluating repository failed: " + str(e))
62
+ paper.code_repro_auto["live"] = "No"
63
+ return paper
64
+
65
+ def process_row(paper):
66
+ if ((paper.zip_path is None) or (not(os.path.exists(paper.zip_path)))):
67
+ paper.log("ERROR", "Zip file doesn't exist")
68
+ return paper
69
+
70
+ paper = evaluate(None, paper, paper.log)
71
+
72
+ return paper
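evaluate() now receives a Paper, writes its findings into paper.code_repro_auto, and process_row() guards against missing zip files. A hedged sketch of running the checks over a batch of papers loaded from the dump produced by data_generation/fetch_processed.py:

```python
import pandas as pd
from core.paper import Paper
from evaluations.repo_evaluations import process_row

df = pd.read_csv("data/papers.csv", sep="\t")
papers = [process_row(Paper.from_row(row)) for _, row in df.iterrows()]

live = [p for p in papers if p.code_repro_auto.get("live") == "Yes"]
print(f"{len(live)}/{len(papers)} repositories evaluated successfully")
```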
 
 
 
 
 
 
 
 
evaluations/requirements.py CHANGED
@@ -1,24 +1,26 @@
1
- from .utils import log
2
 
3
- def evaluate(verbose, llm, zip, readme):
4
- log(verbose, "TITLE", "\nLooking for package dependencies for running the code...")
5
  overall = "No"
6
 
7
  scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
8
 
9
  files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
10
- files = [file_path for file_path in files if len(file_path.split("/")) == 2]
11
  for file in files:
12
- log(verbose, "LOG", f"Found requirements file: {file}")
13
  requirements = zip.open(file).read().decode("utf-8")
14
- overall = "Yes"
15
  if (len(requirements.split("\n")) < 5):
16
- log(verbose, "ERROR", "Requirements file contains too few lines.")
17
- overall = "No"
18
-
19
- if (readme):
20
- if (("requirement" in readme) | ("Requirement" in readme) | ("Dependenc" in readme) | ("dependenc" in readme) | (len([row for row in readme.split("\n") if (("#" in row) & (("environment" in row) | ("Environment" in row)))]) > 0)):
21
- log(verbose, "LOG", "Found dependencies in README file")
22
- overall = "Yes"
 
 
23
 
24
  return overall
 
1
+ from core.conversion import noop_logger
2
 
3
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
4
+ log_fn("TITLE", "\nLooking for package dependencies for running the code...")
5
  overall = "No"
6
 
7
  scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
8
 
9
  files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
10
+ # files = [file_path for file_path in files if len(file_path.split("/")) == 2]
11
  for file in files:
12
+ log_fn("LOG", f"Found requirements file: {file}")
13
  requirements = zip.open(file).read().decode("utf-8")
14
+
15
  if (len(requirements.split("\n")) < 5):
16
+ log_fn("ERROR", "Requirements file contains too few lines.")
17
+ continue
18
+ overall = "Yes"
19
+
20
+ for readme in readmes:
21
+ if (readme):
22
+ if (("requirement" in readme) | ("Requirement" in readme) | ("Dependenc" in readme) | ("dependenc" in readme) | (len([row for row in readme.split("\n") if (("#" in row) & (("environment" in row) | ("Environment" in row)))]) > 0)):
23
+ log_fn("LOG", "Found dependencies in README file")
24
+ overall = "Yes"
25
 
26
  return overall
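
With the root-level filter commented out, every requirements-style file in the archive is now inspected, and a file with fewer than five lines is logged and skipped rather than forcing the overall answer back to "No". A minimal sketch of exercising the new signature against a downloaded zipball (the path is hypothetical):

import zipfile
from evaluations import requirements
from evaluations.utils import fetch_readmes

zip_file = zipfile.ZipFile("data/test/example_repo.zip")  # hypothetical local zipball
readmes = fetch_readmes(zip_file)

log_lines = []
answer = requirements.evaluate(None, zip_file, readmes, lambda t, m: log_lines.append((t, m)))
print(answer)  # "Yes" or "No"
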
evaluations/training.py CHANGED
@@ -1,8 +1,8 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for code to train the model...")
6
  overall = "No"
7
 
8
 
@@ -23,15 +23,15 @@ def evaluate(verbose, llm, zip, readme):
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
- log(verbose, "LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
-
30
- if (readme):
31
- if (("train" in readme)):
32
- log(verbose, "LOG", "Found something about training in README file")
33
- overall = "Yes"
34
 
35
  if (overall == "No"):
36
- log(verbose, "ERROR", "Found no code for training the model.")
37
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for code to train the model...")
6
  overall = "No"
7
 
8
 
 
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
+ log_fn("LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
+ for readme in readmes:
30
+ if (readme):
31
+ if (("train" in readme)):
32
+ log_fn("LOG", "Found something about training in README file")
33
+ overall = "Yes"
34
 
35
  if (overall == "No"):
36
+ log_fn("ERROR", "Found no code for training the model.")
37
  return overall
evaluations/url.py ADDED
@@ -0,0 +1,86 @@
 
1
+ import pandas as pd
2
+ import os
3
+ import zipfile
4
+ import os
5
+ import re
6
+ from uuid import uuid4
7
+ import pdfplumber
8
+ import numpy as np
9
+ from urllib.request import urlretrieve
10
+ import xml.etree.ElementTree as ET
11
+ import re
12
+ import fitz # PyMuPDF
13
+
14
+ def get_fitz_urls(pdf_path):
15
+ doc = fitz.open(pdf_path)
16
+ urls = []
17
+
18
+ for page in doc:
19
+ for link in page.get_links():
20
+ if 'uri' in link:
21
+ urls.append(link['uri'])
22
+
23
+ return urls
24
+
25
+
26
+ NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}
27
+
28
+ def find_pattern_in_xml(root, pattern):
29
+ """
30
+ Recursively search for a regex pattern in all text fields of an XML tree.
31
+
32
+ :param root: The root Element of the XML tree
33
+ :param pattern: The regex pattern to search for
34
+ :return: A list of matching strings
35
+ """
36
+ matches = []
37
+ regex = re.compile(pattern)
38
+
39
+ # Check element text
40
+ if root.text:
41
+ matches.extend(regex.findall(root.text))
42
+
43
+ # Check element attributes
44
+ for attr_value in root.attrib.values():
45
+ matches.extend(regex.findall(attr_value))
46
+
47
+ # Recursively search in children
48
+ for child in root:
49
+ matches.extend(find_pattern_in_xml(child, pattern))
50
+
51
+ return matches
52
+
53
+ def fetch_url(pdf_path):
54
+ if (pdf_path is None):
55
+ raise ValueError("Pdf has no path")
56
+
57
+ urls = []
58
+ link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
59
+
60
+ # if (method == "plumber"):
61
+ full_text = ""
62
+ with pdfplumber.open(pdf_path) as pdf:
63
+ for page in pdf.pages:
64
+ # Extract text from the page and normalize spaces
65
+ text = page.extract_text()
66
+ if text:
67
+ full_text += text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ") + " "
68
+
69
+ # Find all URLs in the combined text
70
+ found_urls = re.findall(link_pattern, full_text)
71
+ urls.extend(found_urls)
72
+ # elif (method == "grobid"):
73
+ # paper = pdf_to_grobid(file_name)
74
+ # found_urls = find_pattern_in_xml(paper, link_pattern)
75
+ # urls.extend(found_urls)
76
+ # os.remove(file_name)
77
+ # elif (method == "fitz")
78
+ fitz_urls = get_fitz_urls(pdf_path)
79
+ urls.extend(fitz_urls)
80
+ urls = np.unique(urls)
81
+ urls = [s for s in urls if "/" in s]
82
+ urls = [s for s in urls if "git" in s]
83
+ # else:
84
+ # raise Exception("Method unknown")
85
+ return urls
86
+
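
fetch_url merges two passes — a broad URL regex over the pdfplumber text (with line-break artifacts stitched back together) and the embedded link annotations PyMuPDF exposes — then keeps only candidates containing a slash and "git". A minimal usage sketch (the PDF path is hypothetical):

from evaluations.url import fetch_url

candidate_urls = fetch_url("data/test/paper.pdf")  # hypothetical downloaded paper
for url in candidate_urls:
    print(url)  # e.g. github.com/<user>/<repo> links found in the text or annotations
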
evaluations/utils.py CHANGED
@@ -4,7 +4,8 @@ import time
4
  import os
5
  import zipfile
6
  import json
7
- import streamlit as st
 
8
 
9
  def fetch_code(zip_file):
10
  zip_content_dict = {}
@@ -14,27 +15,8 @@ def fetch_code(zip_file):
14
  zip_content_dict[file_name] = file_content
15
  return zip_content_dict
16
 
17
- def get_api_link(url):
18
- username, repo_name = decompose_url(url)
19
- if (username == None):
20
- return ""
21
- return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
22
 
23
- def decompose_url(url):
24
- try:
25
- url = url.split("github.com")[1]
26
- url = url.strip(".")
27
- url = url.split(".git")[0]
28
- url = url.strip("/")
29
- parts = url.split("/")
30
- username = parts[0]
31
- repo_name = parts[1]
32
- return username, repo_name
33
- except:
34
- return None, None
35
-
36
-
37
- def fetch_repo_stars(verbose, repo_url, token):
38
  headers = {"Authorization": f"token {token}"}
39
  api_url = get_api_link(repo_url)
40
  api_url = api_url.replace("/zipball/", "")
@@ -45,38 +27,14 @@ def fetch_repo_stars(verbose, repo_url, token):
45
  if response.status_code == 200:
46
  return json.loads(response.content)["stargazers_count"]
47
  if (response.status_code == 404):
48
- log(verbose, "ERROR", "Repository private.")
49
-
50
- def fetch_repo(verbose, repo_url, repo_name, token):
51
- if (os.path.exists(repo_name)):
52
- os.remove(repo_name)
53
-
54
- if ("github.com" not in repo_url):
55
- log(verbose, "ERROR", f"URL not for github repo, please evaluate manually ({repo_url}).")
56
- return
57
-
58
- headers = {"Authorization": f"token {token}"}
59
- api_url = get_api_link(repo_url)
60
-
61
- if (api_url == ""):
62
- log(verbose, "ERROR", f"Failed to parse the URL, please evaluate manually ({repo_url}).")
63
- return
64
 
65
- # Sending GET request to GitHub API
66
- response = requests.get(api_url, headers=headers)
67
-
68
- if response.status_code == 200:
69
- with open(repo_name, 'wb') as file:
70
- file.write(response.content)
71
- if (response.status_code == 404):
72
- log(verbose, "ERROR", "Repository private / Link broken.")
73
-
74
- def fetch_readme(zip):
75
- readme_files = [readme for readme in zip.namelist() if ((readme.endswith("README.MD") | readme.endswith("README.md") | readme.endswith("readme.md")) & (len(readme.split("/")) == 2))]
76
- readme = ""
77
  for readme_file in readme_files:
78
- readme += zip.open(readme_file).read().decode("utf-8") + "\n\n"
79
- return readme
80
 
81
  def fetch_license(zip):
82
  license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
@@ -85,7 +43,7 @@ def fetch_license(zip):
85
  license = zip.open(license_files[0]).read().decode("utf-8")
86
  return license
87
 
88
- def fetch_openalex(verbose, paper_name, year):
89
  api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
90
 
91
  response = requests.get(api_url)
@@ -93,36 +51,7 @@ def fetch_openalex(verbose, paper_name, year):
93
  if response.status_code == 200:
94
  return response.json()
95
  else:
96
- log(verbose, "WARNING", "Could not find OpenAlex information for paper.")
97
-
98
-
99
- def log(verbose, log_type, log_text, hf=False):
100
- if (verbose == 0):
101
- return
102
-
103
- show_tips = (verbose == 2) | (verbose == 4)
104
- if ((verbose == 1) | (verbose == 2)):
105
- show = print
106
- if ((verbose == 3) | (verbose == 4)):
107
- show = st.write
108
-
109
- # Align line-break
110
- if (log_text.startswith("\n")):
111
- show("\n")
112
- log_text = log_text.lstrip('\n')
113
-
114
- # Only show tips in verbose mode 2 and 4
115
- if ((log_type == "TITLE") & show_tips):
116
- show(f"\n#### {log_text}")
117
- if ((log_type == "TIP") & show_tips):
118
- show(f"*{log_text}*")
119
- if ((log_type == "LOG") & show_tips):
120
- show(f"{log_text}")
121
- if ((log_type == "ERROR")):
122
- show(f"**{log_text}**")
123
-
124
- if ((log_type != "TIP") & (log_type != "LOG") & (log_type != "ERROR") & (log_type != "TITLE")):
125
- raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")
126
 
127
- def init_llm(verbose):
128
- log(verbose, "LOG", "Initializing LLM...")
 
4
  import os
5
  import zipfile
6
  import json
7
+ from core.conversion import get_api_link
8
+ from core.conversion import noop_logger
9
 
10
  def fetch_code(zip_file):
11
  zip_content_dict = {}
 
15
  zip_content_dict[file_name] = file_content
16
  return zip_content_dict
17
 
 
 
 
 
 
18
 
19
+ def fetch_repo_stars(repo_url, token, log_fn=noop_logger):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  headers = {"Authorization": f"token {token}"}
21
  api_url = get_api_link(repo_url)
22
  api_url = api_url.replace("/zipball/", "")
 
27
  if response.status_code == 200:
28
  return json.loads(response.content)["stargazers_count"]
29
  if (response.status_code == 404):
30
+ log_fn("ERROR", "Repository private.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def fetch_readmes(zip):
33
+ readme_files = [readme for readme in zip.namelist() if (readme.lower().endswith("readme.md") & (len(readme.split("/")) == 2))]
34
+ readmes = []
 
 
 
 
 
 
 
 
 
35
  for readme_file in readme_files:
36
+ readmes.append(zip.open(readme_file).read().decode("utf-8"))
37
+ return readmes
38
 
39
  def fetch_license(zip):
40
  license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
 
43
  license = zip.open(license_files[0]).read().decode("utf-8")
44
  return license
45
 
46
+ def fetch_openalex(paper_name, year, log_fn=noop_logger):
47
  api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
48
 
49
  response = requests.get(api_url)
 
51
  if response.status_code == 200:
52
  return response.json()
53
  else:
54
+ log_fn("WARNING", "Could not find OpenAlex information for paper.")
 
55
 
56
+ def init_llm(log_fn=noop_logger):
57
+ log_fn("LOG", "Initializing LLM...")
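
fetch_readmes now returns a list of README contents instead of one concatenated string, and the len(readme.split("/")) == 2 check keeps only files at the top level of the zipball, whose single root folder contributes one path segment. A short illustration of which entries pass the filter, assuming the usual <repo>-<sha>/ layout of a GitHub zipball:

entries = [
    "repo-abc123/README.md",       # kept: two path segments and ends with readme.md
    "repo-abc123/docs/README.md",  # skipped: nested three segments deep
    "repo-abc123/README.rst",      # skipped: only Markdown READMEs are matched
]
for name in entries:
    kept = name.lower().endswith("readme.md") and len(name.split("/")) == 2
    print(name, kept)
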
evaluations/validating.py CHANGED
@@ -1,8 +1,8 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for examples for running the model...")
6
  overall = "No"
7
  patterns = {
8
  'tensorflow': [
@@ -23,14 +23,15 @@ def evaluate(verbose, llm, zip, readme):
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
- log(verbose, "LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
- if (readme):
30
- if ((len(re.findall("testing", readme)) > 0)):
31
- log(verbose, "LOG", "Found information about evaluations in readme")
32
- overall = "Yes"
 
33
 
34
  if (overall == "No"):
35
- log(verbose, "ERROR", "Found no code for evaluating the model.")
36
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for examples for running the model...")
6
  overall = "No"
7
  patterns = {
8
  'tensorflow': [
 
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
+ log_fn("LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
+ for readme in readmes:
30
+ if (readme):
31
+ if ((len(re.findall("testing", readme)) > 0)):
32
+ log_fn("LOG", "Found information about evaluations in readme")
33
+ overall = "Yes"
34
 
35
  if (overall == "No"):
36
+ log_fn("ERROR", "Found no code for evaluating the model.")
37
  return overall
evaluations/weights.py CHANGED
@@ -1,52 +1,53 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for pre-trained model weights...")
6
  overall = "No"
7
  files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
8
  if (len(files) > 0):
9
- log(verbose, "LOG", f"Found model weights: {files}")
10
  overall = "Yes"
11
  return overall
12
 
13
- if (readme):
14
-
15
- url_pattern = r'(https?://[^\s]+)'
16
- urls = re.findall(url_pattern, readme)
17
- if (len([url for url in urls if "pth" in url]) > 0):
18
- log(verbose, "LOG", "Found a link to pre-trained weights in readme")
19
- overall = "Yes"
20
- return overall
21
-
22
- readme_lines = readme.split("\n")
23
- if (len([row for row in readme_lines if ((len(re.findall("pretrained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
24
- log(verbose, "LOG", "Found a link for 'pretrained' something in readme")
25
- overall = "Yes"
26
- return overall
27
-
28
- if (len([row for row in readme_lines if ((len(re.findall("pre-trained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
29
- log(verbose, "LOG", "Found a link for 'pre-trained' something in readme")
30
- overall = "Yes"
31
- return overall
32
-
33
- if (len([row for row in readme_lines if ((len(re.findall("weight", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
34
- log(verbose, "LOG", "Found a link for 'weight' something in readme")
35
- overall = "Yes"
36
- return overall
37
-
38
- if (len([row for row in readme_lines if ((len(re.findall("download", row, re.IGNORECASE)) > 0) & (len(re.findall("model", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
39
- log(verbose, "LOG", "Found a link for 'model' something in readme")
40
- overall = "Yes"
41
- return overall
42
-
43
- if (llm):
44
- prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
45
- ans = llm.predict("STRICT", prompt)
46
- if (("Yes" in ans) & ("No" not in ans)):
47
- log(verbose, "LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
48
- overall = "Yes"
49
- return overall
50
-
51
- log(verbose, "ERROR", "Found no pre-trained model weights.")
 
52
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for pre-trained model weights...")
6
  overall = "No"
7
  files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
8
  if (len(files) > 0):
9
+ log_fn("LOG", f"Found model weights: {files}")
10
  overall = "Yes"
11
  return overall
12
 
13
+ for readme in readmes:
14
+ if (readme):
15
+
16
+ url_pattern = r'(https?://[^\s]+)'
17
+ urls = re.findall(url_pattern, readme)
18
+ if (len([url for url in urls if "pth" in url]) > 0):
19
+ log_fn("LOG", "Found a link to pre-trained weights in readme")
20
+ overall = "Yes"
21
+ return overall
22
+
23
+ readme_lines = readme.split("\n")
24
+ if (len([row for row in readme_lines if ((len(re.findall("pretrained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
25
+ log_fn("LOG", "Found a link for 'pretrained' something in readme")
26
+ overall = "Yes"
27
+ return overall
28
+
29
+ if (len([row for row in readme_lines if ((len(re.findall("pre-trained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
30
+ log_fn("LOG", "Found a link for 'pre-trained' something in readme")
31
+ overall = "Yes"
32
+ return overall
33
+
34
+ if (len([row for row in readme_lines if ((len(re.findall("weight", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
35
+ log_fn("LOG", "Found a link for 'weight' something in readme")
36
+ overall = "Yes"
37
+ return overall
38
+
39
+ if (len([row for row in readme_lines if ((len(re.findall("download", row, re.IGNORECASE)) > 0) & (len(re.findall("model", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
40
+ log_fn("LOG", "Found a link for 'model' something in readme")
41
+ overall = "Yes"
42
+ return overall
43
+
44
+ if (llm):
45
+ prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
46
+ ans = llm.predict("STRICT", prompt)
47
+ if (("Yes" in ans) & ("No" not in ans)):
48
+ log_fn("LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
49
+ overall = "Yes"
50
+ return overall
51
+
52
+ log_fn("ERROR", "Found no pre-trained model weights.")
53
  return overall
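
The final fallback asks the LLM whether the README links pre-trained weights via llm.predict("STRICT", prompt). The real LLM wrapper is not part of this hunk, so the stub below only illustrates the predict(mode, prompt) shape these call sites assume; it is not the actual class.

class FakeLLM:
    """Hypothetical stand-in matching the predict(mode, prompt) call sites above."""
    def predict(self, mode, prompt):
        # mode is a decoding preset name such as "STRICT"; a real wrapper queries a hosted model.
        return "Yes" if "download" in prompt.lower() else "No"

# weights.evaluate(FakeLLM(), zip_file, readmes, log_fn) would then exercise the LLM branch.
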
full_eval.py CHANGED
@@ -1,4 +1,19 @@
1
- from evaluations.repo_evaluations import full_evaluation
 
 
 
 
 
2
 
3
- res = full_evaluation()
4
- res.to_csv("data/results.csv", sep="\t", index=False)
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from evaluations.repo_evaluations import process_row
3
+ from evaluations.utils import *
4
+ from concurrent.futures import ProcessPoolExecutor
5
+ from tqdm import tqdm
6
+ from core.paper import Paper
7
 
8
+ if __name__ == "__main__":
9
+ paper_dump = pd.read_csv("data/urls.csv", sep="\t")
10
+
11
+ max_workers = 12
12
+ papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]
13
+ # papers = [paper for paper in papers if paper.main_repo_url == "https://github.com/AsukaDaisuki/MAT"]
14
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
15
+ papers = list(tqdm(executor.map(process_row, papers), total=len(papers), desc="Running repo evaluations"))
16
+
17
+ results = [p.to_dict() for p in papers]
18
+ results_df = pd.DataFrame(results)
19
+ results_df.to_csv("data/results.csv", sep="\t", index=False)
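
full_eval.py builds Paper objects from data/urls.csv, fans the evaluations out over a ProcessPoolExecutor, and serialises each result with to_dict(). core/paper.py itself is not shown in this diff, so the stand-in below only records the interface this script and process_row rely on; any attribute not visible in the diff is an assumption.

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class PaperStub:
    # Fields touched by full_eval.py / process_row(); the real class likely has more.
    title: str
    year: int
    venue: str
    main_repo_url: Optional[str]
    zip_path: Optional[str]
    code_repro_auto: dict = field(default_factory=dict)

    def log(self, log_type, log_text):
        print(f"[{self.title}] {log_type}: {log_text}")

    def to_dict(self):
        return self.__dict__.copy()
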
midl_summary.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- import pandas as pd
3
- import numpy as np
4
-
5
-
6
- compare_to_gt = True
7
- ground_truth = pd.read_csv("data/zipfiles.csv", sep="\t")
8
- results = pd.read_csv("data/results.csv", sep="\t")
9
- verbose = 0
10
-
11
- eval_readme = []
12
- eval_training = []
13
- eval_evaluating = []
14
- eval_licensing = []
15
- eval_weights = []
16
- eval_dependencies = []
17
- full_results = []
18
- for (index1, row1), (index2, row2) in zip(ground_truth.iterrows(), results.iterrows()):
19
- if (pd.isna(row1["training"])):
20
- continue
21
-
22
- print(f"\nEvaluating {index1+1} out of {len(ground_truth.index)} papers...")
23
- print(f'Paper title - "{row1["title"]}" ({row1["year"]})')
24
- print(f'Repository link - {row1["url"]}')
25
- if ((not(pd.isna(row1["dependencies"]))) & (row2["pred_dependencies"] is not None)):
26
- eval_dependencies.append(row2["pred_dependencies"] == row1["dependencies"])
27
- if (row2["pred_dependencies"] != row1["dependencies"]):
28
- print(f"Dependencies acc. - {row2['pred_dependencies']} (GT:{row1['dependencies']})")
29
- if ((not(pd.isna(row1["training"]))) & (row2["pred_dependencies"] is not None)):
30
- eval_training.append(row1["training"] == row2["pred_training"])
31
- if (row1["training"] != row2["pred_training"]):
32
- print(f"Training acc. -{row2['pred_training']} (GT:{row1['training']})")
33
- if ((not(pd.isna(row1["evaluation"]))) & (row2["pred_dependencies"] is not None)):
34
- eval_evaluating.append(row1["evaluation"] == row2["pred_evaluation"])
35
- if (row1["evaluation"] != row2["pred_evaluation"]):
36
- print(f"Evaluating acc. - {row2['pred_evaluation']} (GT:{row1['evaluation']})")
37
- if ((not(pd.isna(row1["weights"]))) & (row2["pred_dependencies"] is not None)):
38
- eval_weights.append(row1["weights"] == row2["pred_weights"])
39
- if (row1["weights"] != row2["pred_weights"]):
40
- print(f"Weights acc. - {row2['pred_weights']} (GT:{row1['weights']})")
41
- if ((not(pd.isna(row1["readme"]))) & (row2["pred_dependencies"] is not None)):
42
- eval_readme.append(row1["readme"] == row2["pred_readme"])
43
- if (row1["readme"] != row2["pred_readme"]):
44
- print(f"README acc. - {row2['pred_readme']} (GT:{row1['readme']})")
45
- if ((not(pd.isna(row1["license"]))) & (row2["pred_dependencies"] is not None)):
46
- eval_licensing.append(("No" if row1["license"] == "No" else "Yes") == row2["pred_license"])
47
- if (("No" if row1["license"] == "No" else "Yes") != row2["pred_license"]):
48
- print(f"LICENSE acc. - {row2['pred_license']} (GT:{row1['license']})")
49
-
50
-
51
- print("\nSummary:")
52
- print(f"Dependencies acc. - {int(100 * np.mean(eval_dependencies))}%")
53
- print(f"Training acc. - {int(100 * np.mean(eval_training))}%")
54
- print(f"Evaluating acc. - {int(100 * np.mean(eval_evaluating))}%")
55
- print(f"Weights acc. - {int(100 * np.mean(eval_weights))}%")
56
- print(f"README acc. - {int(100 * np.mean(eval_readme))}%")
57
- print(f"LICENSE acc. - {int(100 * np.mean(eval_licensing))}%")
 
 
plotting/midl_summary.py ADDED
@@ -0,0 +1,59 @@
 
1
+ import os
2
+ import os
3
+ import sys
4
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5
+ sys.path.append(ROOT_DIR)
6
+ import pandas as pd
7
+ import numpy as np
8
+ from core.paper import Paper
9
+
10
+ def compare(ground_truth, automated_truth, key, verbose=False):
11
+ if key not in ground_truth.keys() or key not in automated_truth.keys():
12
+ return np.nan
13
+ if (pd.isna(ground_truth[key]) or pd.isna(automated_truth[key])):
14
+ return np.nan
15
+
16
+ if (key == "license"):
17
+ ground_truth[key] = "No" if ground_truth[key] == "No" else "Yes"
18
+ res = ground_truth[key] == automated_truth[key]
19
+ if verbose and res == False:
20
+ print(f"{key} acc. - {automated_truth[key]} (GT:{ground_truth[key]})")
21
+ return res
22
+
23
+ max_workers = 6
24
+ compare_to_gt = True
25
+ verbose = True
26
+ training = True
27
+
28
+ paper_dump = pd.read_csv("data/results.csv", sep="\t")
29
+ papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]
30
+
31
+ eval_readme = []
32
+ eval_training = []
33
+ eval_evaluating = []
34
+ eval_licensing = []
35
+ eval_weights = []
36
+ eval_dependencies = []
37
+ full_results = []
38
+ for idx, paper in enumerate(papers):
39
+ if paper.venue != "MIDL" or paper.main_repo_url is None or (int(paper.year) >= 2024 if training else int(paper.year) < 2024):
40
+ continue
41
+
42
+ if (verbose):
43
+ print(f"\nEvaluating {idx} out of {len(papers)} papers...")
44
+ print(f'Paper title - "{paper.title}" ({paper.year})')
45
+ print(f'Repository link - {paper.main_repo_url}')
46
+ eval_dependencies.append(compare(paper.code_repro_manual, paper.code_repro_auto, "dependencies", verbose))
47
+ eval_training.append(compare(paper.code_repro_manual, paper.code_repro_auto, "training", verbose))
48
+ eval_evaluating.append(compare(paper.code_repro_manual, paper.code_repro_auto, "evaluation", verbose))
49
+ eval_weights.append(compare(paper.code_repro_manual, paper.code_repro_auto, "weights", verbose))
50
+ eval_readme.append(compare(paper.code_repro_manual, paper.code_repro_auto, "readme", verbose))
51
+ eval_licensing.append(compare(paper.code_repro_manual, paper.code_repro_auto, "license", verbose))
52
+
53
+ print("\nSummary:")
54
+ print(f"Dependencies acc. - {int(100 * np.nanmean(eval_dependencies))}%")
55
+ print(f"Training acc. - {int(100 * np.nanmean(eval_training))}%")
56
+ print(f"Evaluating acc. - {int(100 * np.nanmean(eval_evaluating))}%")
57
+ print(f"Weights acc. - {int(100 * np.nanmean(eval_weights))}%")
58
+ print(f"README acc. - {int(100 * np.nanmean(eval_readme))}%")
59
+ print(f"LICENSE acc. - {int(100 * np.nanmean(eval_licensing))}%")
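
compare returns np.nan whenever either side of a comparison is missing, so the summary averages a mixed list of booleans and NaNs with np.nanmean. A tiny worked example of what that does (values invented for illustration):

import numpy as np

# Three papers with ground truth (True/False) and one without (nan):
eval_training = [True, False, True, np.nan]
print(int(100 * np.nanmean(eval_training)))  # 66 — the nan entry is ignored, 2 of 3 match
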
plotting/paper_plots.py CHANGED
@@ -1,41 +1,34 @@
1
  import plotly.express as px
 
 
 
 
2
  import numpy as np
3
-
4
- paper_dump = pd.read_csv('data/dump.csv', sep="\t")
5
- # Calculate total number of URLs per year and venue
6
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
7
- total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
8
-
9
- # Calculate the number of URLs with errors per year and venue
10
- total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
11
-
12
- # Merge the DataFrames to calculate the error rate
13
- merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
14
- merged_df['repo_rate'] = merged_df['total_urls'] / merged_df['total_titles']
15
-
16
- # Plot the error rates using Plotly, with year on x-axis and color by venue
17
- fig = px.bar(
18
- merged_df,
19
- x='year',
20
- y='total_titles',
21
- color='venue',
22
- barmode='group',
23
- title=f'Number of papers per venue',
24
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
25
- category_orders={'venue': custom_order}
26
- )
27
-
28
- fig.update_xaxes(range=[2018, 2024])
29
- fig.show()
30
-
31
  import plotly.express as px
32
  import numpy as np
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Calculate total number of URLs per year and venue
35
  total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
36
 
37
  # Calculate the number of URLs with errors per year and venue
38
- total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
 
39
 
40
  # Merge the DataFrames to calculate the error rate
41
  merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
@@ -50,25 +43,10 @@ fig = px.bar(
50
  barmode='group',
51
  title=f'Number of papers per venue',
52
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
53
- category_orders={'venue': custom_order}
54
- )
55
-
56
- fig.update_xaxes(range=[2018, 2024])
57
- fig.show()
58
-
59
- # Plot the error rates using Plotly, with year on x-axis and color by venue
60
- fig = px.bar(
61
- merged_df,
62
- x='year',
63
- y='total_urls',
64
- color='venue',
65
- barmode='group',
66
- title=f'Number of papers per venue',
67
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
68
- category_orders={'venue': custom_order}
69
  )
70
 
71
- fig.update_xaxes(range=[2018, 2024])
72
  fig.show()
73
 
74
 
@@ -81,9 +59,9 @@ fig = px.bar(
81
  barmode='group',
82
  title=f'Number of repositories per venue',
83
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
84
- category_orders={'venue': custom_order}
85
  )
86
- fig.update_xaxes(range=[2018, 2024])
87
  fig.update_yaxes(range=[0, 1])
88
 
89
  fig.show()
 
1
  import plotly.express as px
2
+ import os
3
+ import sys
4
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5
+ sys.path.append(ROOT_DIR)
6
  import numpy as np
7
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import plotly.express as px
9
  import numpy as np
10
+ from config.constants import VENUE_ORDER
11
+ import json
12
+ from core.paper import _parse_url_field
13
 
14
+ paper_dump = pd.read_csv('data/urls.csv', sep="\t")
15
+ paper_dump['urls_manual'] = paper_dump['urls_manual'].apply(
16
+ lambda x: _parse_url_field(json.loads(x))
17
+ )
18
+ paper_dump['urls_auto'] = paper_dump['urls_auto'].apply(
19
+ lambda x: _parse_url_field(json.loads(x))
20
+ )
21
+ paper_dump['url'] = paper_dump.apply(
22
+ lambda row: next((u for u in [*row['urls_manual'], *row['urls_auto']] if "github.com" in u), None),
23
+ axis=1
24
+ )
25
  # Calculate total number of URLs per year and venue
26
  total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
27
 
28
  # Calculate the number of URLs with errors per year and venue
29
+ total_url_per_venue = paper_dump[
30
+ paper_dump["url"].notna() & (paper_dump["url"] != "")
31
+ ].groupby(['year', 'venue']).size().reset_index(name='total_urls')
32
 
33
  # Merge the DataFrames to calculate the error rate
34
  merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
 
43
  barmode='group',
44
  title=f'Number of papers per venue',
45
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
46
+ category_orders={'venue': VENUE_ORDER}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  )
48
 
49
+ fig.update_xaxes(range=[2018, 2025])
50
  fig.show()
51
 
52
 
 
59
  barmode='group',
60
  title=f'Number of repositories per venue',
61
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
62
+ category_orders={'venue': VENUE_ORDER}
63
  )
64
+ fig.update_xaxes(range=[2018, 2025])
65
  fig.update_yaxes(range=[0, 1])
66
 
67
  fig.show()
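
The plotting script now derives one url per paper by decoding the JSON-encoded urls_manual / urls_auto columns with _parse_url_field from core.paper, which is not included in this diff. The helper below is only a guess at the behaviour the plotting code assumes — normalising whatever is stored into a plain list of URL strings — and is not the actual implementation.

def _parse_url_field_sketch(value):
    # Assumed behaviour: accept a list, a single string, or None and always return a list.
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)

# paper_dump['urls_manual'].apply(lambda x: _parse_url_field_sketch(json.loads(x)))
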
plotting/print_incorrect.py ADDED
@@ -0,0 +1,44 @@
 
1
+ import plotly.express as px
2
+ import pandas as pd
3
+ import re
4
+
5
+ # Define columns for all relevant predictions
6
+ pred_columns = ['pred_dependencies', 'pred_training',
7
+ 'pred_evaluation', 'pred_weights', 'pred_readme',
8
+ 'pred_license']
9
+
10
+ # Define the real and predicted column pairs
11
+ real_pred_columns = {
12
+ 'dependencies': 'pred_dependencies',
13
+ 'training': 'pred_training',
14
+ 'evaluation': 'pred_evaluation',
15
+ 'weights': 'pred_weights',
16
+ 'readme': 'pred_readme',
17
+ 'license': 'pred_license'
18
+ }
19
+
20
+ df = pd.read_csv('data/results.csv', sep="\t")
21
+
22
+ # Cleanup
23
+ df['year'] = pd.to_numeric(df['year'], errors='coerce')
24
+ df = df.dropna(subset=['year'])
25
+ df['year'] = df['year'].astype(int)
26
+
27
+ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
28
+
29
+ # Group by venue
30
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
31
+ df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == "No") | (pd.isna(row))) else "Yes")
32
+
33
+ # Add matching counts for each category
34
+ for real, pred in real_pred_columns.items():
35
+ df_filtered[f'matching_{real}'] = df_filtered[real] == df_filtered[pred]
36
+
37
+ for real, pred in real_pred_columns.items():
38
+ print(f"Evaluations for {real}:")
39
+ for idx, row in df_filtered.iterrows():
40
+ if ((row['year'] == 2024) | pd.isna(row["url"]) | (row["url"] == "") | (pd.isna(row[real]))):
41
+ continue
42
+
43
+ if not(row[f'matching_{real}']):
44
+ print(f"Automated test for {real} failed for link: {row['url']} [{row[real]} - {row[pred]}]")
plotting/result_plots.py CHANGED
@@ -1,25 +1,66 @@
1
  import plotly.express as px
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  df = pd.read_csv('data/results.csv', sep="\t")
 
 
 
 
 
 
 
 
5
  custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
6
 
7
- # Calculate total number of URLs per year and venue
8
- total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
9
 
10
- # Calculate the number of URLs with errors per year and venue
11
- errors_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='errors')
12
 
13
- # Merge the DataFrames to calculate the error rate
14
- error_rate_df = pd.merge(total_urls_per_year_venue, errors_per_year_venue, on=['year', 'venue'], how='left')
15
- error_rate_df['errors'] = error_rate_df['errors'].fillna(0) # Replace NaN with 0 for venues with no errors
16
- error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
17
 
18
  # Plot the error rates using Plotly, with year on x-axis and color by venue
19
  fig = px.bar(
20
- error_rate_df,
21
  x='year',
22
- y='error_rate',
23
  color='venue',
24
  barmode='group',
25
  title=f'Success Rate per Venue and Year for "valid_url"',
@@ -32,48 +73,53 @@ fig.update_xaxes(range=[2017.5, 2024.5])
32
  fig.show()
33
 
34
 
35
- for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
36
- # Calculate total number of URLs per year and venue
37
- total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
 
 
 
 
 
38
 
39
- # Calculate the number of URLs with errors per year and venue
40
- passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')
41
 
42
- # Merge the DataFrames to calculate the error rate
43
- success_rate_df = pd.merge(total_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
44
- success_rate_df['successes'] = success_rate_df['successes'].fillna(0) # Replace NaN with 0 for venues with no errors
45
- success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
46
 
47
- # Plot the error rates using Plotly, with year on x-axis and color by venue
48
- fig = px.bar(
49
- success_rate_df,
50
- x='year',
51
- y='success_rate',
52
- color='venue',
53
- barmode='group',
54
- title=f'Success Rate per Venue and Year for "{topic}"',
55
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
56
- category_orders={'venue': custom_order}
57
- )
58
 
59
- fig.update_yaxes(range=[0, 1])
60
- fig.update_xaxes(range=[2017.5, 2024.5])
61
- fig.show()
 
 
62
 
 
 
 
63
 
64
  # List of columns to check for "No"
65
- columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
 
 
 
66
 
67
  # Step 1: Calculate the number of "No" answers per row for the specified columns
68
- df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
69
 
70
  # Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
71
  fig = px.scatter(
72
- df,
73
  x='pred_citations',
74
  y='no_count',
75
  color='venue',
76
- title='Number of "No" Answers vs Predicted Stars, Color Coded by Venue',
77
  labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
78
  category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
79
  log_x=True
@@ -82,19 +128,16 @@ fig = px.scatter(
82
  # Step 3: Display the scatter plot
83
  fig.show()
84
 
85
- # List of columns to check for "No"
86
- columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
87
-
88
- # Step 1: Calculate the number of "No" answers per row for the specified columns
89
- df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
90
 
91
  # Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
92
  fig = px.strip(
93
- df,
94
  x='venue',
95
  y='no_count',
96
  color='venue',
97
- title='Individual "No" Scores with Jitter per Venue',
98
  labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
99
  category_orders={'venue': custom_order}, # Ensure custom order for venues
100
  stripmode='overlay' # Allows all individual points to overlay each other
@@ -105,7 +148,7 @@ fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))
105
 
106
  # Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
107
  fig.add_trace(px.box(
108
- df,
109
  x='venue',
110
  y='no_count',
111
  category_orders={'venue': custom_order}
@@ -114,28 +157,38 @@ fig.add_trace(px.box(
114
  # Step 5: Show the plot
115
  fig.show()
116
 
117
- for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
118
- # Calculate total number of URLs per venue
119
- total_urls_per_venue = df.groupby('venue').size().reset_index(name='total_urls')
120
-
121
- # Calculate the number of URLs with errors per venue
122
- errors_per_venue = df[df[topic] != "No"].groupby('venue').size().reset_index(name='errors')
123
-
124
- # Merge the DataFrames to calculate the error rate
125
- error_rate_df = pd.merge(total_urls_per_venue, errors_per_venue, on='venue', how='left')
126
- error_rate_df['errors'] = error_rate_df['errors'].fillna(0) # Replace NaN with 0 for venues with no errors
127
- error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
128
-
129
- # Plot the error rates using Plotly, with venue on x-axis
130
- fig = px.bar(
131
- error_rate_df,
132
- x='venue',
133
- y='error_rate',
134
- color='venue',
135
- title=f'Success Rate per Venue for "{topic}"',
136
- labels={'error_rate': 'Success Rate', 'venue': 'Venue'},
137
- category_orders={'venue': custom_order}
138
- )
139
-
140
- fig.update_yaxes(range=[0, 1])
141
- fig.show()
 
 
 
 
 
 
 
 
 
 
 
1
  import plotly.express as px
2
  import pandas as pd
3
+ import re
4
+
5
+ # Define columns for all relevant predictions
6
+ pred_columns = ['pred_dependencies', 'pred_training',
7
+ 'pred_evaluation', 'pred_weights', 'pred_readme',
8
+ 'pred_license']
9
+
10
+ # Define the real and predicted column pairs
11
+ real_pred_columns = {
12
+ 'dependencies': 'pred_dependencies',
13
+ 'training': 'pred_training',
14
+ 'evaluation': 'pred_evaluation',
15
+ 'weights': 'pred_weights',
16
+ 'readme': 'pred_readme',
17
+ 'license': 'pred_license'
18
+ }
19
 
20
  df = pd.read_csv('data/results.csv', sep="\t")
21
+
22
+ # Cleanup
23
+ df['year'] = pd.to_numeric(df['year'], errors='coerce')
24
+ df = df.dropna(subset=['year'])
25
+ df['year'] = df['year'].astype(int)
26
+
27
+ # df['venue'] = df['venue'].apply(lambda x: str(re.search(r"'(.*?)'", x).group(1)))
28
+
29
  custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
30
 
31
+ # Group by year and venue, and calculate the ratio of papers where URL is not None
32
+ df_grouped = df.groupby(['year', 'venue']).agg(
33
+ total_papers=('title', 'count'),
34
+ papers_with_url=('url', lambda x: x.notna().sum()),
35
+ valid_urls=('pred_live', lambda x: (x == "Yes").sum())
36
+ ).reset_index()
37
+
38
+ df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']
39
+
40
+ # Create the plotly figure
41
+ fig = px.bar(
42
+ df_grouped,
43
+ x='year',
44
+ y='ratio',
45
+ color='venue',
46
+ barmode='group',
47
+ title=f'Success Rate per Venue and Year for "valid_url"',
48
+ labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},
49
+ category_orders={'venue': custom_order}
50
+ )
51
+
52
+ fig.update_yaxes(range=[0, 1])
53
+ fig.update_xaxes(range=[2017.5, 2024.5])
54
+ fig.show()
55
 
56
+ df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']
 
57
 
 
 
 
 
58
 
59
  # Plot the error rates using Plotly, with year on x-axis and color by venue
60
  fig = px.bar(
61
+ df_grouped,
62
  x='year',
63
+ y='valid_ratio',
64
  color='venue',
65
  barmode='group',
66
  title=f'Success Rate per Venue and Year for "valid_url"',
 
73
  fig.show()
74
 
75
 
76
+ # Ensure boolean columns are actually booleans
77
+ df_new = df.copy()
78
+ for col in pred_columns:
79
+ df_new[col] = df_new[col] == "Yes"
80
+
81
+ df_grouped = df_new.groupby('venue').agg(
82
+ valid_urls=('pred_live', lambda x: (x == "Yes").sum()),
83
+ **{col: (col, lambda x: x[df_new['pred_live'] == "Yes"].sum()) for col in pred_columns}
84
+ ).reset_index()
85
 
 
 
86
 
87
+ # Calculate the ratio for each prediction column
88
+ for col in pred_columns:
89
+ df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']
 
90
 
91
+ # Melt the dataframe for easier plotting
92
+ df_melted = df_grouped.melt(id_vars=['venue'],
93
+ value_vars=pred_columns,
94
+ var_name='Prediction Type',
95
+ value_name='Ratio')
 
 
 
 
 
 
96
 
97
+ # Create a grouped bar plot
98
+ fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',
99
+ barmode='group', # Ensures bars are side by side
100
+ category_orders={'venue': custom_order},
101
+ title='Ratio of Predictions by Venue')
102
 
103
+ # Show the figure
104
+ fig.update_yaxes(range=[0, 1])
105
+ fig.show()
106
 
107
  # List of columns to check for "No"
108
+ # Step 1: Filter only rows where pred_live is "Yes"
109
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
110
+ for col in pred_columns:
111
+ df_filtered[col] = df_filtered[col] == "Yes"
112
 
113
  # Step 1: Calculate the number of "No" answers per row for the specified columns
114
+ df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)
115
 
116
  # Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
117
  fig = px.scatter(
118
+ df_filtered,
119
  x='pred_citations',
120
  y='no_count',
121
  color='venue',
122
+ title='Number of passed tests, Color Coded by Venue',
123
  labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
124
  category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
125
  log_x=True
 
128
  # Step 3: Display the scatter plot
129
  fig.show()
130
 
131
+ # [np.corrcoef(np.array(df_filtered[col][~(pd.isna(df_filtered['pred_citations']))], dtype=int), df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])[0, 1] for col in pred_columns]
132
+ # np.corrcoef(np.array(df_filtered['no_count'][~(pd.isna(df_filtered['pred_citations']))]), (1 + np.array(df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])))
 
 
 
133
 
134
  # Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
135
  fig = px.strip(
136
+ df_filtered,
137
  x='venue',
138
  y='no_count',
139
  color='venue',
140
+ title='Automated Reproducibility Score per Venue',
141
  labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
142
  category_orders={'venue': custom_order}, # Ensure custom order for venues
143
  stripmode='overlay' # Allows all individual points to overlay each other
 
148
 
149
  # Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
150
  fig.add_trace(px.box(
151
+ df_filtered,
152
  x='venue',
153
  y='no_count',
154
  category_orders={'venue': custom_order}
 
157
  # Step 5: Show the plot
158
  fig.show()
159
 
160
+ # Group by venue
161
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
162
+ df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == "No") | (pd.isna(row))) else "Yes")
163
+ df_grouped = df_filtered.groupby('venue').agg(
164
+ total_papers=('title', 'count')
165
+ ).reset_index()
166
+
167
+ # Add matching counts for each category
168
+ for real, pred in real_pred_columns.items():
169
+ df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)
170
+
171
+ # Compute the ratio for each category
172
+ for real in real_pred_columns.keys():
173
+ df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']
174
+
175
+ # Melt the dataframe for visualization
176
+ df_melted = df_grouped.melt(id_vars=['venue'],
177
+ value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()],
178
+ var_name='Category',
179
+ value_name='Ratio')
180
+
181
+ # Clean up category names
182
+ df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()
183
+
184
+ # Create the bar plot
185
+ fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',
186
+ barmode='group',
187
+ title='Ratio of Matching Real vs Predicted Categories by Venue',
188
+ labels={'Ratio': 'Ratio of Matches'})
189
+
190
+ # Ensure y-axis range is between 0 and 1
191
+ fig.update_yaxes(range=[0, 1])
192
+
193
+ # Show the figure
194
+ fig.show()
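
One caveat on the matching-ratio block above: assigning df_filtered.groupby('venue').apply(...).reset_index(drop=True) into df_grouped relies on both frames listing venues in the same sorted order. A merge keyed on venue avoids that positional assumption; a hedged alternative sketch for one category:

# Alternative to the positional assignment: join the per-venue match counts explicitly.
matches = (
    df_filtered.groupby('venue')
    .apply(lambda g: (g['training'] == g['pred_training']).sum())
    .rename('matching_training')
    .reset_index()
)
df_grouped = df_grouped.merge(matches, on='venue', how='left')
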
plotting/results.ipynb ADDED
@@ -0,0 +1,241 @@
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "vscode": {
8
+ "languageId": "plaintext"
9
+ }
10
+ },
11
+ "outputs": [],
12
+ "source": [
13
+ "import plotly.express as px\n",
14
+ "import pandas as pd\n",
15
+ "import re\n",
16
+ "\n",
17
+ "# Define columns for all relevant predictions\n",
18
+ "pred_columns = ['pred_dependencies', 'pred_training', \n",
19
+ " 'pred_evaluation', 'pred_weights', 'pred_readme', \n",
20
+ " 'pred_license']\n",
21
+ "\n",
22
+ "# Define the real and predicted column pairs\n",
23
+ "real_pred_columns = {\n",
24
+ " 'dependencies': 'pred_dependencies',\n",
25
+ " 'training': 'pred_training',\n",
26
+ " 'evaluation': 'pred_evaluation',\n",
27
+ " 'weights': 'pred_weights',\n",
28
+ " 'readme': 'pred_readme',\n",
29
+ " 'license': 'pred_license'\n",
30
+ "}\n"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {
37
+ "vscode": {
38
+ "languageId": "plaintext"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "\n",
44
+ "df = pd.read_csv('data/results.csv', sep=\"\\t\")\n",
45
+ "\n",
46
+ "# Cleanup\n",
47
+ "df['year'] = pd.to_numeric(df['year'], errors='coerce')\n",
48
+ "df = df.dropna(subset=['year'])\n",
49
+ "df['year'] = df['year'].astype(int)\n",
50
+ "\n",
51
+ "df['venue'] = df['venue'].apply(lambda x: str(re.search(r\"'(.*?)'\", x).group(1)))\n",
52
+ "\n",
53
+ "custom_order = [\"MICCAI\", \"MIDL\", \"Nature\", \"arXiv\"]\n",
54
+ "\n",
55
+ "# Group by year and venue, and calculate the ratio of papers where URL is not None\n",
56
+ "df_grouped = df.groupby(['year', 'venue']).agg(\n",
57
+ " total_papers=('title', 'count'),\n",
58
+ " papers_with_url=('url', lambda x: x.notna().sum()),\n",
59
+ " valid_urls=('pred_live', lambda x: (x == \"Yes\").sum())\n",
60
+ ").reset_index()\n",
61
+ "\n",
62
+ "df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']\n",
63
+ "\n",
64
+ "# Create the plotly figure\n",
65
+ "fig = px.bar(\n",
66
+ " df_grouped,\n",
67
+ " x='year',\n",
68
+ " y='ratio',\n",
69
+ " color='venue',\n",
70
+ " barmode='group',\n",
71
+ " title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
72
+ " labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},\n",
73
+ " category_orders={'venue': custom_order}\n",
74
+ ")\n",
75
+ "\n",
76
+ "fig.update_yaxes(range=[0, 1])\n",
77
+ "fig.update_xaxes(range=[2017.5, 2024.5])\n",
78
+ "fig.show()\n",
79
+ "\n",
80
+ "df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']\n",
81
+ "\n",
82
+ "\n",
83
+ "# Plot the error rates using Plotly, with year on x-axis and color by venue\n",
84
+ "fig = px.bar(\n",
85
+ " df_grouped,\n",
86
+ " x='year',\n",
87
+ " y='valid_ratio',\n",
88
+ " color='venue',\n",
89
+ " barmode='group',\n",
90
+ " title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
91
+ " labels={'error_rate': 'Success Rate', 'year': 'Year'},\n",
92
+ " category_orders={'venue': custom_order}\n",
93
+ ")\n",
94
+ "\n",
95
+ "fig.update_yaxes(range=[0, 1])\n",
96
+ "fig.update_xaxes(range=[2017.5, 2024.5])\n",
97
+ "fig.show()\n",
98
+ "\n",
99
+ "\n",
100
+ "# Ensure boolean columns are actually booleans\n",
101
+ "df_new = df.copy()\n",
102
+ "for col in pred_columns:\n",
103
+ " df_new[col] = df_new[col] == \"Yes\"\n",
104
+ "\n",
105
+ "df_grouped = df_new.groupby('venue').agg(\n",
106
+ " valid_urls=('pred_live', lambda x: (x == \"Yes\").sum()),\n",
107
+ " **{col: (col, lambda x: x[df_new['pred_live'] == \"Yes\"].sum()) for col in pred_columns} \n",
108
+ ").reset_index()\n",
109
+ "\n",
110
+ "\n",
111
+ "# Calculate the ratio for each prediction column\n",
112
+ "for col in pred_columns:\n",
113
+ " df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']\n",
114
+ "\n",
115
+ "# Melt the dataframe for easier plotting\n",
116
+ "df_melted = df_grouped.melt(id_vars=['venue'], \n",
117
+ " value_vars=pred_columns, \n",
118
+ " var_name='Prediction Type', \n",
119
+ " value_name='Ratio')\n",
120
+ "\n",
121
+ "# Create a grouped bar plot\n",
122
+ "fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',\n",
123
+ " barmode='group', # Ensures bars are side by side\n",
124
+ " category_orders={'venue': custom_order},\n",
125
+ " title='Ratio of Predictions by Venue')\n",
126
+ "\n",
127
+ "# Show the figure\n",
128
+ "fig.update_yaxes(range=[0, 1])\n",
129
+ "fig.show()\n",
130
+ "\n",
131
+ "# List of columns to check for \"No\"\n",
132
+ "# Step 1: Filter only rows where pred_live is \"Yes\"\n",
133
+ "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
134
+ "for col in pred_columns:\n",
135
+ " df_filtered[col] = df_filtered[col] == \"Yes\"\n",
136
+ "# Step 1: Count how many checks passed (\"Yes\") per row across the prediction columns\n",
137
+ "# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
138
+ "df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)\n",
139
+ "\n",
140
+ "# Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue\n",
141
+ "fig = px.scatter(\n",
142
+ " df_filtered,\n",
143
+ " x='pred_citations',\n",
144
+ " y='no_count',\n",
145
+ " color='venue',\n",
146
+ " title='Number of passed tests, Color Coded by Venue',\n",
147
+ " labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},\n",
148
+ " category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary\n",
149
+ " log_x=True\n",
150
+ ")\n",
151
+ "\n",
152
+ "# Step 3: Display the scatter plot\n",
153
+ "fig.show()\n",
154
+ "\n",
155
+ "# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
156
+ "# Step 1: Count how many checks passed (\"Yes\") per row across the prediction columns\n",
157
+ "\n",
158
+ "# Step 2: Create a strip plot (scatter-like) with jitter to show individual \"No\" counts\n",
159
+ "fig = px.strip(\n",
160
+ " df_filtered,\n",
161
+ " x='venue',\n",
162
+ " y='no_count',\n",
163
+ " color='venue',\n",
164
+ " title='Automated Reproducibility Score per Venue',\n",
165
+ " labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},\n",
166
+ " category_orders={'venue': custom_order}, # Ensure custom order for venues\n",
167
+ " stripmode='overlay' # Allows all individual points to overlay each other\n",
168
+ ")\n",
169
+ "\n",
170
+ "# Step 3: Add some jitter to the x-axis so points don't overlap\n",
171
+ "fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))\n",
172
+ "\n",
173
+ "# Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread\n",
174
+ "fig.add_trace(px.box(\n",
175
+ " df_filtered,\n",
176
+ " x='venue',\n",
177
+ " y='no_count',\n",
178
+ " category_orders={'venue': custom_order}\n",
179
+ ").data[0]) # We add the first trace of the box plot to overlay\n",
180
+ "\n",
181
+ "# Step 5: Show the plot\n",
182
+ "fig.show()\n"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "vscode": {
190
+ "languageId": "plaintext"
191
+ }
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "\n",
196
+ "# Group by venue\n",
197
+ "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
198
+ "df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == \"No\") | (pd.isna(row))) else \"Yes\")\n",
199
+ "df_grouped = df_filtered.groupby('venue').agg(\n",
200
+ " total_papers=('title', 'count')\n",
201
+ ").reset_index()\n",
202
+ "\n",
203
+ "# Add matching counts for each category\n",
204
+ "for real, pred in real_pred_columns.items():\n",
205
+ " df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)\n",
206
+ "\n",
207
+ "# Compute the ratio for each category\n",
208
+ "for real in real_pred_columns.keys():\n",
209
+ " df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']\n",
210
+ "\n",
211
+ "# Melt the dataframe for visualization\n",
212
+ "df_melted = df_grouped.melt(id_vars=['venue'], \n",
213
+ " value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()], \n",
214
+ " var_name='Category', \n",
215
+ " value_name='Ratio')\n",
216
+ "\n",
217
+ "# Clean up category names\n",
218
+ "df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()\n",
219
+ "\n",
220
+ "# Create the bar plot\n",
221
+ "fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',\n",
222
+ " barmode='group', \n",
223
+ " title='Ratio of Matching Real vs Predicted Categories by Venue',\n",
224
+ " labels={'Ratio': 'Ratio of Matches'})\n",
225
+ "\n",
226
+ "# Ensure y-axis range is between 0 and 1\n",
227
+ "fig.update_yaxes(range=[0, 1])\n",
228
+ "\n",
229
+ "# Show the figure\n",
230
+ "fig.show()"
231
+ ]
232
+ }
233
+ ],
234
+ "metadata": {
235
+ "language_info": {
236
+ "name": "python"
237
+ }
238
+ },
239
+ "nbformat": 4,
240
+ "nbformat_minor": 2
241
+ }
plotting/urls.py ADDED
@@ -0,0 +1,37 @@
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import ast
4
+ import os
5
+ import sys
6
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
7
+ sys.path.append(ROOT_DIR)
8
+ from core.paper import Paper
9
+
10
+ df = pd.read_csv('data/urls.csv', sep="\t")
11
+ success = 0
12
+ total = 0
13
+ papers = [Paper.from_row(row) for _, row in df.iterrows()]
14
+
15
+ def normalize_url(url):
16
+ return url.strip().lower().rstrip("/")
17
+
18
+ tp, fp, fn = 0, 0, 0
19
+ for paper in papers:
20
+ if (paper.venue == "MICCAI"):
21
+ continue
22
+
23
+ urls_auto = [normalize_url(u) for u in paper.urls_auto]
24
+ urls_manual = [normalize_url(u) for u in paper.urls_manual]
25
+
26
+ auto_set = set(urls_auto)
27
+ manual_set = set(urls_manual)
28
+
29
+ tp += len(auto_set & manual_set)
30
+ fp += len(auto_set - manual_set)
31
+ fn += len(manual_set - auto_set)
32
+
33
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 0
34
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0
35
+
36
+ print(f"Precision: {precision:.3f}")
37
+ print(f"Recall: {recall:.3f}")
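
For reference, a tiny worked example of the set-based precision and recall computed above (URLs invented): if extraction returns {a, b} and the manual annotation is {a, c}, then tp = 1, fp = 1, fn = 1, so both precision and recall are 0.5.

auto_set = {"github.com/org/a", "github.com/org/b"}    # extracted automatically
manual_set = {"github.com/org/a", "github.com/org/c"}  # annotated manually

tp = len(auto_set & manual_set)  # 1
fp = len(auto_set - manual_set)  # 1
fn = len(manual_set - auto_set)  # 1
print(tp / (tp + fp), tp / (tp + fn))  # 0.5 0.5
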