Attila Simkó committed on
Commit 2db37b1 · 1 Parent(s): 3cfadc8

big upgrade

.gitignore CHANGED
@@ -1,10 +1,4 @@
- data/MIDL.csv
- data/MICCAI.csv
- data/arXiv.csv
- data/Nature.csv
- data/results.csv
- data/*.zip
- data/test/*
+ data/
  *.env
  .env
  evaluations/__pycache__/*
app.py CHANGED
@@ -2,8 +2,11 @@ import streamlit as st
  from evaluations.repo_evaluations import evaluate
  from evaluations.models import RemoteLLM
  import requests
+ from core.paper import Paper
+ from core.conversion import fetch_repo
+ import os
 
- model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
+ model = None # RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
 
  st.write("\n")
  st.write("Welcome to the online reproducibility evaluation tool!")
@@ -12,10 +15,11 @@ st.write("Additionally we look for common pitfalls in the code according to a pu
 
  checkbox = st.checkbox("Would you like to see recommendations during evaluation?", value=False)
  repo_link = st.text_input("Github repository link:", value="", type="default", help=None)
- 
  if (repo_link):
      verbose = 4 if checkbox else 3
-     evaluate(llm=model, verbose=verbose, repo_url=repo_link)
+     paper = Paper.from_url(repo_link, verbose=verbose)
+     fetch_repo(0, paper.main_repo_url, paper.zip_path, os.getenv("githubToken"))
+     evaluate(model, paper, paper.log)
 
  with st.form("my_form"):
      st.write("Notice something wrong? Please tell us so we can improve.")
config/__pycache__/constants.cpython-312.pyc ADDED
Binary file (684 Bytes).
 
config/constants.py ADDED
@@ -0,0 +1,12 @@
+ from enum import Enum, auto
+
+ VENUE_ORDER = ["MICCAI", "MIDL", "Nature", "arXiv"]
+ MIDL_COLORS = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000", "#FF862C", "#800080"]
+
+ class LogType(Enum):
+     TITLE = "TITLE"
+     LOG = "LOG"
+     ERROR = "ERROR"
+     WARNING = "WARNING"
+     NOTE = "NOTE"
+     INFO = "INFO"
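A brief illustration of how these constants are consumed elsewhere in this commit: the scraping scripts iterate VENUE_ORDER, and Paper.log looks levels up by name in LogType.

```python
from config.constants import VENUE_ORDER, MIDL_COLORS, LogType

# Pair each venue with a plot colour (MIDL_COLORS holds extra colours beyond the four venues).
venue_colors = dict(zip(VENUE_ORDER, MIDL_COLORS))
print(venue_colors["MICCAI"])        # '#506775'

# Levels are looked up by name, exactly as Paper.log does with LogType[level.upper()].
level = LogType["WARNING"]
print(level.name, level.value)       # WARNING WARNING
```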
core/__pycache__/conversion.cpython-312.pyc ADDED
Binary file (10.1 kB).
 
core/__pycache__/paper.cpython-312.pyc ADDED
Binary file (8.64 kB).
 
core/conversion.py ADDED
@@ -0,0 +1,205 @@
+ import os
+ import random
+ import time
+ import requests
+ import pandas as pd
+ import numpy as np
+ import xml.etree.ElementTree as ET
+ from urllib.request import urlretrieve
+ from tqdm import tqdm
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import faiss
+
20
+
21
+ token = os.getenv("githubToken")
22
+
23
+ def noop_logger(*args, **kwargs):
24
+ pass
25
+
26
+ def download_pdf(paper, max_retries=3):
27
+ if pd.isna(paper.pdf_url):
28
+ paper.log("ERROR", "Missing PDF URL")
29
+ return paper
30
+
31
+ pdf_path = paper.pdf_path
32
+ if (os.path.exists(pdf_path)):
33
+ return paper
34
+
35
+ headers = {'User-Agent': 'Mozilla/5.0'}
36
+
37
+ for attempt in range(max_retries):
38
+ try:
39
+ response = requests.get(paper.pdf_url, headers=headers)
40
+ if response.status_code == 200:
41
+ with open(pdf_path, "wb") as f:
42
+ f.write(response.content)
43
+
44
+ time.sleep(random.uniform(1.0, 3.0))
45
+ return paper
46
+ elif response.status_code == 429:
47
+ wait = 2 ** attempt
48
+ paper.log("WARNING", f"Rate limited, retrying in {wait}s...")
49
+ time.sleep(wait)
50
+ else:
51
+ paper.log("ERROR", f"Download failed: HTTP {response.status_code}")
52
+ break
53
+ except Exception as e:
54
+ paper.log("ERROR", f"Download error: {e}")
55
+ time.sleep(1)
56
+
57
+ return paper
58
+
59
+ def get_api_link(url):
60
+ username, repo_name = decompose_url(url)
61
+ if (username == None):
62
+ return ""
63
+ return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
64
+
65
+ def decompose_url(url):
66
+ try:
67
+ url = url.split("github.com")[1]
68
+ url = url.strip(".")
69
+ url = url.split(".git")[0]
70
+ url = url.strip("/")
71
+ parts = url.split("/")
72
+ username = parts[0]
73
+ repo_name = parts[1]
74
+ return username, repo_name
75
+ except:
76
+ return None, None
77
+
78
+ def fetch_repo(verbose, repo_url, repo_name, token, force_download=False):
79
+ if (os.path.exists(repo_name)):
80
+ if (force_download):
81
+ os.remove(repo_name)
82
+ else:
83
+ return
84
+
85
+ if ("github.com" not in repo_url):
86
+ return ValueError(f"URL not for github repo, please evaluate manually ({repo_url}).")
87
+
88
+ headers = {"Authorization": f"token {token}"}
89
+ api_url = get_api_link(repo_url)
90
+
91
+ if (api_url == ""):
92
+ return ValueError(f"Failed to parse the URL, please evaluate manually ({repo_url}).")
93
+
94
+ # Sending GET request to GitHub API
95
+ response = requests.get(api_url, headers=headers)
96
+
97
+ if response.status_code == 200:
98
+ with open(repo_name, 'wb') as file:
99
+ file.write(response.content)
100
+ if (response.status_code == 404):
101
+ return ValueError("Repository private / Link broken.")
102
+
103
+ def download_repo(paper):
104
+ try:
105
+ if (paper.main_repo_url is None):
106
+ return
107
+
108
+ fetch_repo(0, paper.main_repo_url, paper.zip_path, token)
109
+ except Exception as e:
110
+ paper.log("ERROR", f"Repo download failed: {e}")
111
+ return paper
112
+
113
+
114
+ def pdf_to_grobid(filename, save_path=None, grobid_url="https://attilasimko-grobid.hf.space/"):
115
+ """
116
+ Convert a PDF to Grobid XML.
117
+
118
+ Parameters:
119
+ filename (str or list): Path to the PDF file or list of PDF files.
120
+ save_path (str, optional): Directory or file path to save to. Defaults to the current directory.
121
+ grobid_url (str, optional): URL of the Grobid server. Defaults to public server.
122
+
123
+ Returns:
124
+ str or list: Path(s) to the saved XML file(s) or parsed XML object if saved to a temp file.
125
+ """
126
+
127
+ def is_server_up(url):
128
+ try:
129
+ response = requests.get(url + "/api/health", timeout=5)
130
+ return response.status_code == 200
131
+ except requests.RequestException:
132
+ return False
133
+
134
+ if not is_server_up(grobid_url):
135
+ raise ConnectionError(f"The Grobid server {grobid_url} is not available.")
136
+
137
+ # Handle multiple files
138
+ if isinstance(filename, list):
139
+ if save_path is None or not os.path.isdir(save_path):
140
+ print(f"Warning: {save_path} is not a directory. PDFs will be saved in the current directory: {os.getcwd()}")
141
+ save_path = "."
142
+
143
+ xmls = []
144
+ for pdf in tqdm(filename, desc="Processing PDFs"):
145
+ try:
146
+ xml = pdf_to_grobid(pdf, save_path, grobid_url)
147
+ xmls.append(xml)
148
+ except Exception as e:
149
+ print(f"Error processing {pdf}: {e}")
150
+ xmls.append(None)
151
+
152
+ return xmls
153
+
154
+ # Handle directory input
155
+ if os.path.isdir(filename):
156
+ pdfs = [os.path.join(filename, f) for f in os.listdir(filename) if f.endswith(".pdf")]
157
+ if not pdfs:
158
+ print(f"Warning: No PDF files found in directory {filename}")
159
+ return pdf_to_grobid(pdfs, save_path, grobid_url)
160
+
161
+ # Ensure file exists
162
+ if not os.path.isfile(filename):
163
+ raise FileNotFoundError(f"The file {filename} does not exist.")
164
+
165
+ # Send PDF to Grobid
166
+ with open(filename, "rb") as file:
167
+ files = {"input": file}
168
+ post_url = f"{grobid_url}/api/processFulltextDocument"
169
+ response = requests.post(post_url, files=files)
170
+
171
+ if response.status_code != 200:
172
+ raise Exception(f"Error: {response.reason}")
173
+
174
+ # Determine save path
175
+ if save_path is None:
176
+ save_file = os.path.join(os.getcwd(), "temp_grobid.xml")
177
+ elif os.path.isdir(save_path):
178
+ base_name = os.path.splitext(os.path.basename(filename))[0] + ".xml"
179
+ save_file = os.path.join(save_path, base_name)
180
+ else:
181
+ save_file = save_path if save_path.endswith(".xml") else save_path + ".xml"
182
+
183
+ # Save the response
184
+ with open(save_file, "wb") as f:
185
+ f.write(response.content)
186
+
187
+ # Return XML object if saved to temp file
188
+ if save_path is None:
189
+ return ET.parse(save_file).getroot()
190
+ else:
191
+ return save_file
192
+
193
+ def extract_body(xml_root):
194
+ """Extracts and returns the text content of the paper's body from Grobid XML."""
195
+ namespace = {"tei": "http://www.tei-c.org/ns/1.0"} # Define TEI namespace
196
+ body_text = []
197
+
198
+ # Locate <body> in the XML structure
199
+ body = xml_root.find(".//tei:body", namespace)
200
+ if body is not None:
201
+ for p in body.findall(".//tei:p", namespace): # Get all paragraphs inside <body>
202
+ if p.text:
203
+ body_text.append(p.text.strip())
204
+
205
+ return "\n".join(body_text)
core/paper.py ADDED
@@ -0,0 +1,160 @@
1
+ # core/paper.py
2
+
3
+ import os
4
+ import uuid
5
+ import re
6
+ import fitz # PyMuPDF
7
+ import pdfplumber
8
+ import hashlib
9
+ import json
10
+ import pandas as pd
11
+ from config.constants import LogType
12
+ import ast
13
+ import streamlit as st
14
+ import datetime
15
+ from enum import Enum
16
+
17
+ def _parse_url_field(value):
18
+ if isinstance(value, list):
19
+ return value
20
+ if isinstance(value, str):
21
+ try:
22
+ parsed = ast.literal_eval(value)
23
+ return parsed if isinstance(parsed, list) else [parsed]
24
+ except Exception:
25
+ return [value]
26
+ if pd.isna(value):
27
+ return []
28
+ return [value]
29
+
30
+ class Paper:
31
+ def __init__(self, title="", venue="", year="", paper_id=None, pdf_url=None, urls_manual=None, urls_auto=None, code_repro_manual=None, code_repro_auto=None, logs=None, verbose=0):
32
+ # Metadata
33
+ self.title = title
34
+ self.venue = venue
35
+ self.year = year
36
+ self.pdf_url = pdf_url
37
+
38
+
39
+ # Optional ground truth links (e.g., from curated metadata)
40
+
41
+ self.urls_manual = _parse_url_field(urls_manual)
42
+ self.urls_auto = _parse_url_field(urls_auto)
43
+
44
+ self.paper_id = self._compute_id() if pd.isna(paper_id) else paper_id
45
+ self.pdf_path = None if (pd.isna(pdf_url)) else "data/papers/" + self.paper_id + ".pdf"
46
+ self.xml_path = None if (pd.isna(pdf_url)) else "data/xml/" + self.paper_id + ".xml"
47
+ self.zip_path = None if (pd.isna(self.main_repo_url)) else "data/test/" + self.paper_id + ".zip"
48
+
49
+ # Internal state
50
+ self.logs = logs if logs is not None else []
51
+ self.code_repro_manual = dict() if pd.isna(code_repro_manual) else code_repro_manual
52
+ self.code_repro_auto = dict() if pd.isna(code_repro_auto) else code_repro_auto
53
+ self.verbose = verbose
54
+
55
+ def __repr__(self):
56
+ return f"<Paper: {self.title}>"
57
+
58
+ @classmethod
59
+ def from_url(cls, code_url, verbose):
60
+ # Supports both dicts and pandas Series
61
+ return cls(
62
+ urls_manual=code_url,
63
+ verbose=verbose
64
+
65
+ )
66
+
67
+ @classmethod
68
+ def from_raw(cls, row):
69
+ # Supports both dicts and pandas Series
70
+ return cls(
71
+ title=row.get("Title", ""),
72
+ venue=row.get("Venue", ""),
73
+ year=row.get("Year", ""),
74
+ pdf_url=row.get('PDF'),
75
+ urls_manual=row.get("Repository"),
76
+ code_repro_manual={"public": row.get("Data Public"), "dependencies": row.get("Dependencies"), "training": row.get("Training code"), "evaluation": row.get("Evaluation code"), "weights": row.get("Pre-trained models"), "readme": row.get("README file"), "license": row.get("Licensing")}
77
+ )
78
+
79
+ @classmethod
80
+ def from_row(cls, row):
81
+ # Supports both dicts and pandas Series
82
+ return cls(
83
+ title=row.get("title", ""),
84
+ venue=row.get("venue", ""),
85
+ year=row.get("year", ""),
86
+ paper_id=row.get('paper_id'),
87
+ pdf_url=row.get('pdf_url'),
88
+ urls_manual=json.loads(row.get("urls_manual")),
89
+ urls_auto=json.loads(row.get("urls_auto")),
90
+ code_repro_manual=json.loads(row.get("code_reproducibility_manual")),
91
+ code_repro_auto=json.loads(row.get("code_reproducibility_auto")),
92
+ logs=json.loads(row.get("logs", "[]"))
93
+ )
94
+
95
+ @property
96
+ def main_repo_url(self):
97
+ urls = [*self.urls_manual, *self.urls_auto]
98
+ github_links = [u for u in urls if "github.com" in u]
99
+ return github_links[0] if github_links else None
100
+
101
+ def _compute_id(self):
102
+ paper_name = self.title
103
+ if (not(pd.isna(self.pdf_url))):
104
+ paper_name += self.pdf_url
105
+
106
+ h = hashlib.sha256()
107
+ h.update(paper_name.encode("utf-8"))
108
+ return h.hexdigest()[:16]
109
+
110
+ def log(self, level, message):
111
+ self.logs.append({
112
+ "timestamp": datetime.datetime.utcnow().isoformat(),
113
+ "level": LogType[level.upper()], # "ERROR", "WARNING", "NOTE", etc.
114
+ "message": message
115
+ })
116
+
117
+ if (self.verbose == 0):
118
+ return
119
+
120
+ show_tips = (self.verbose == 2) | (self.verbose == 4)
121
+ if ((self.verbose == 1) | (self.verbose == 2)):
122
+ show = print
123
+ if ((self.verbose == 3) | (self.verbose == 4)):
124
+ show = st.write
125
+
126
+ # Align line-break
+ if (message.startswith("\n")):
+ show("\n")
+ message = message.lstrip('\n')
+
+ # Only show tips in verbose mode 2 and 4
+ if ((level == "TITLE") & show_tips):
+ show(f"\n#### {message}")
+ if ((level == "TIP") & show_tips):
+ show(f"*{message}*")
+ if ((level == "LOG") & show_tips):
+ show(f"{message}")
+ if ((level == "ERROR")):
+ show(f"**{message}**")
140
+
141
+ if ((level != "TIP") & (level != "LOG") & (level != "ERROR") & (level != "TITLE")):
142
+ raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")
143
+
144
+ def to_dict(self):
145
+ return {
146
+ "title": self.title,
147
+ "venue": self.venue,
148
+ "year": self.year,
149
+ "pdf_url": self.pdf_url,
150
+ "paper_id": self.paper_id,
151
+ "urls_auto": json.dumps(self.urls_auto),
152
+ "urls_manual": json.dumps(self.urls_manual),
153
+ "logs": json.dumps([
154
+ {"type": log["level"].value if isinstance(log["level"], Enum) else log["level"], "message": log["message"]}
155
+ for log in self.logs
156
+ ]),
157
+ "code_reproducibility_manual": json.dumps(self.code_repro_manual),
158
+ "code_reproducibility_auto": json.dumps(self.code_repro_auto),
159
+
160
+ }
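A small sketch of the Paper lifecycle as used in this commit, starting from only a GitHub URL (as the Streamlit app does); the URL is a placeholder:

```python
import pandas as pd
from core.paper import Paper

paper = Paper.from_url("https://github.com/someuser/somerepo", verbose=1)

print(paper.main_repo_url)   # first github.com link among the manual/auto URLs
print(paper.paper_id)        # 16-character sha256 prefix derived from title + pdf_url
print(paper.zip_path)        # data/test/<paper_id>.zip

paper.log("ERROR", "Something went wrong")   # appended to paper.logs and printed (verbose=1)

# to_dict() json-encodes the list/dict fields so the row can go straight into a DataFrame.
df = pd.DataFrame([paper.to_dict()])
```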
data/dump.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/fetch_miccai.py DELETED
@@ -1,60 +0,0 @@
1
- import pandas as pd
2
- import requests
3
- import re
4
- from multiprocessing import Pool, cpu_count
5
- from functools import partial
6
-
7
- # Function to process each URL
8
- def process_paper(year, url):
9
- try:
10
- paper_page = requests.get(url).text
11
-
12
- # Find title
13
- title_pattern = r'<title>(.*?)\s*</title>'
14
- title_match = re.search(title_pattern, paper_page, re.DOTALL)
15
- title = title_match.group(1)
16
-
17
- # Find the code repository link
18
- code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
19
- code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
20
- code_repo_link = code_repo_match.group(1) if code_repo_match else ""
21
-
22
- # Find the dataset information
23
- dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
24
- dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
25
- dataset_info = "Yes" if dataset_match else "No"
26
-
27
- # Return a dictionary of the results
28
- return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
29
-
30
- except Exception as e:
31
- print(f"Error processing {url}: {e}")
32
- return None
33
-
34
- current_year = 2024 # Update with the current year
35
- MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org/2022/papers/", "https://conferences.miccai.org/2023/papers/"]
36
- MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org", "https://conferences.miccai.org"]
37
- years = [2021, 2022, 2023]
38
- # Set debug mode
39
- debug = False
40
-
41
- # Fetch all URLs for each year
42
- all_year_urls = []
43
- for i in range(len(MICCAI_pages)):
44
- year_page = requests.get(MICCAI_pages[i]).text
45
- print(year_page)
46
- urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0] for line in year_page.split('\n') if "&bullet" in line]
47
- all_year_urls.extend([(years[i], url) for url in urls])
48
-
49
- print(all_year_urls)
50
- # Parallel processing using Pool
51
- # if __name__ == "__main__":
52
- # with Pool(processes=12) as pool: # Use 12 processes
53
- # results = pool.starmap(process_paper, all_year_urls)
54
-
55
- # # Filter out any None results due to errors
56
- # results = [result for result in results if result is not None]
57
-
58
- # miccai = pd.DataFrame(results)
59
- # # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
60
- # miccai.to_csv('miccai.csv')
data/fetch_processed.py DELETED
@@ -1,31 +0,0 @@
1
- import csv
2
- import numpy as np
3
- import pandas as pd
4
- import re
5
-
6
- current_year = 2024
7
- MIDL_years = range(2018, current_year + 1, 1)
8
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
9
-
10
- for venue in custom_order:
11
- df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
12
- df = df.replace('\t', ' ', regex=True)
13
- df.to_csv(f'data/{venue}.csv', sep="\t")
14
-
15
- # Store all evaluations here
16
- paper_dump = pd.DataFrame()
17
- # Official color codes for conferences
18
- MIDL_colors = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000"]
19
-
20
- for venue in custom_order:
21
- with open(f'data/{venue}.csv') as file:
22
- tsv_file = csv.reader(file, delimiter="\t")
23
- for row in tsv_file:
24
- if (row[0] == ""):
25
- continue
26
-
27
- if (row[1] == ""):
28
- continue
29
-
30
- paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]]})], ignore_index=True)
31
- paper_dump.to_csv(f'data/dump.csv', sep="\t")
data/fetch_zips.py DELETED
@@ -1,43 +0,0 @@
1
- import csv
2
- import numpy as np
3
- import sys
4
- import pandas as pd
5
- import re
6
- sys.path.append("./")
7
- from evaluations.utils import *
8
-
9
- token = os.getenv("githubToken")
10
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
11
-
12
- for venue in custom_order:
13
- df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
14
- df = df.replace('\t', ' ', regex=True)
15
- df.to_csv(f'data/{venue}.csv', sep="\t")
16
-
17
- # Store all evaluations here
18
- paper_dump = pd.DataFrame()
19
- # Official color codes for conferences
20
- zip_idx = 0
21
-
22
- for venue in custom_order:
23
- with open(f'data/{venue}.csv') as file:
24
- tsv_file = csv.reader(file, delimiter="\t")
25
- for row in tsv_file:
26
- if (row[0] == ""):
27
- continue
28
-
29
- if (row[1] == ""):
30
- continue
31
-
32
- repo_url = row[4]
33
- username, repo_name = decompose_url(repo_url)
34
- repo_save_name = f"repo_{zip_idx}.zip"
35
- repository_zip_name = f"data/test/{repo_save_name}"
36
- log(0, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
37
- fetch_repo(0, repo_url, repository_zip_name, token)
38
-
39
- if (os.path.exists(repository_zip_name)):
40
- paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]], "zip_idx": [ repository_zip_name ]})], ignore_index=True)
41
- zip_idx += 1
42
-
43
- paper_dump.to_csv(f'data/zipfiles.csv', sep="\t")
data/zipfiles.csv DELETED
The diff for this file is too large to render. See raw diff
 
data_generation/fetch_processed.py ADDED
@@ -0,0 +1,123 @@
+ import sys
+ import os
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ sys.path.append(ROOT_DIR)
+
+ import csv
+ import re
+ import numpy as np
+ import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+ from tqdm import tqdm
+
+ from evaluations.utils import *
+ from evaluations.url import fetch_url
+ from core.paper import Paper
+ from core.conversion import download_repo, download_pdf, pdf_to_grobid
+ from config.constants import VENUE_ORDER
+
64
+ def get_urls_and_assign(paper):
65
+ try:
66
+ urls = fetch_url(paper.pdf_path)
67
+ paper.urls_auto = urls # Just update this
68
+ except Exception as e:
69
+ paper.log("ERROR", str(e))
70
+ return paper
71
+
72
+ def download_xml(paper):
73
+ try:
74
+ if (paper.pdf_path is None):
75
+ return paper
76
+
77
+ if (os.path.exists(paper.xml_path)):
78
+ paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
79
+ return paper
80
+
81
+ pdf_to_grobid(paper.pdf_path, paper.xml_path)
82
+
83
+ return paper
84
+ except Exception as e:
85
+ paper.log("ERROR", f"Repo download failed: {e}")
86
+ return paper
87
+
88
+
89
+ max_workers = 6
90
+ if __name__ == "__main__":
91
+ for venue in VENUE_ORDER:
92
+ df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
93
+ df = df.replace('\t', ' ', regex=True)
94
+ df = df.replace('[]', '')
95
+ df.to_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
96
+
97
+ papers = []
98
+ for venue in VENUE_ORDER:
99
+ paper_list = pd.read_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
100
+ paper_list["Venue"] = venue
101
+ for _, row in paper_list.iterrows():
102
+ if (row.iloc[0] == ""):
103
+ continue
104
+
105
+ if (row.iloc[1] == ""):
106
+ continue
107
+ papers.append(Paper.from_raw(row))
108
+
109
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
110
+ papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))
111
+
112
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
113
+ papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))
114
+
115
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
116
+ papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))
117
+
118
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
119
+ papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))
120
+
121
+ results = [p.to_dict() for p in papers]
122
+ results_df = pd.DataFrame(results)
123
+ results_df.to_csv("data/papers.csv", sep="\t", index=False)
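The script writes one row per paper to data/papers.csv via Paper.to_dict(); Paper.from_row() is the intended inverse. A hedged sketch of loading that dump back into Paper objects:

```python
import pandas as pd
from core.paper import Paper

# Assumes data/papers.csv was just produced by this script (tab-separated, json-encoded columns).
df = pd.read_csv("data/papers.csv", sep="\t")

papers = [Paper.from_row(row) for _, row in df.iterrows()]
print(len(papers), "papers loaded;",
      sum(p.main_repo_url is not None for p in papers), "with a GitHub link")
```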
{data → data_generation/paper_scraping}/fetch_arxiv.py RENAMED
@@ -1,10 +1,23 @@
1
  import pandas as pd
2
  import requests
3
  import pdfplumber
 
4
  import re
5
  from multiprocessing import Pool, cpu_count
6
  from functools import partial
 
7
  import os
 
 
 
 
 
 
 
 
 
 
 
8
  # Function to process each URL
9
  def process_arxiv_paper(article_link):
10
  try:
@@ -16,69 +29,70 @@ def process_arxiv_paper(article_link):
16
 
17
  article_id = article_link.split("/")[-1]
18
  pdf_url = f'https://arxiv.org/pdf/{article_id}'
19
- response = requests.get(pdf_url)
20
- if response.status_code == 200:
21
- with open(f"{article_id}.pdf", 'wb') as file:
22
- file.write(response.content)
23
- if (response.status_code == 404):
24
- print("Failed to fetch pdf")
25
- return None
26
-
27
- urls = []
28
- link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
29
- with pdfplumber.open(f"{article_id}.pdf") as pdf:
30
- # Loop through all pages
31
- for page_num, page in enumerate(pdf.pages):
32
- # Extract text from the page
33
- text = page.extract_text()
34
-
35
- # Search for a specific word or phrase
36
- found_urls = re.findall(link_pattern, text)
37
- urls.extend(found_urls)
38
- os.remove(f"{article_id}.pdf")
39
- urls = [url for url in urls if ("pytorch" not in url) & ("fchollet" not in url) & (len(url.split("github.com")[1].split("/")) >= 3)]
40
- print(urls)
41
- url = urls[0] if len(urls) > 0 else ""
42
-
43
- # Return a dictionary of the results
44
- return {"venue": "arXiv", "title": title, "url": url, "year": year}
45
 
46
  except Exception as e:
47
  print(f"Error processing {article_link}: {e}")
48
  return None
49
 
50
- # Set debug mode
51
- debug = False
52
- # Fetch all URLs for each year
53
- all_year_urls = []
54
 
55
- page_size = 50
56
- search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
57
  articles = []
58
- for search_query in search_queries:
59
- page = 0
60
- while (page <= 100):
61
- start_idx = page_size * page
62
- url = f"{search_query}{start_idx}"
63
- current_page = requests.get(url).text
64
- pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
65
- matches = re.findall(pattern, current_page)
66
- if (len(matches) == 0):
67
- break
68
- else:
69
- page += 1
70
 
71
- articles += matches
72
- articles = np.unique(articles)
73
 
74
  # Parallel processing using Pool
75
  if __name__ == "__main__":
76
- with Pool(processes=4) as pool:
77
- results = pool.starmap(process_arxiv_paper, [[article] for article in articles])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # Filter out any None results due to errors
80
  results = [result for result in results if result is not None]
81
 
82
  # Convert the list of dictionaries to a DataFrame
83
  arxiv = pd.DataFrame(results)
84
- arxiv.to_csv('arxiv.csv')
 
1
  import pandas as pd
2
  import requests
3
  import pdfplumber
4
+ import numpy as np
5
  import re
6
  from multiprocessing import Pool, cpu_count
7
  from functools import partial
8
+ import urllib, urllib.request
9
  import os
10
+ import sys
11
+ from tqdm import tqdm
12
+ from tqdm.contrib.concurrent import process_map # better for multiprocessing
13
+ import feedparser
14
+ import time
15
+ from datetime import datetime
16
+ from tqdm import tqdm
17
+
18
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
19
+ sys.path.append(ROOT_DIR)
20
+
21
  # Function to process each URL
22
  def process_arxiv_paper(article_link):
23
  try:
 
29
 
30
  article_id = article_link.split("/")[-1]
31
  pdf_url = f'https://arxiv.org/pdf/{article_id}'
32
+
33
+ urls = [] # fetch_url(pdf_url)
34
+
35
+ return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  except Exception as e:
38
  print(f"Error processing {article_link}: {e}")
39
  return None
40
 
41
+ page_size = 100
42
+ base_query = "http://export.arxiv.org/api/query"
43
+ query_params = "search_query=all:(deep+learning)+AND+all:cancer&max_results=100"
 
44
 
 
 
45
  articles = []
46
+ start = 0
47
+ max_empty_pages = 3 # stop early if we hit consecutive empty pages
48
+ empty_pages = 0
 
 
 
 
 
 
 
 
 
49
 
 
 
50
 
51
  # Parallel processing using Pool
52
  if __name__ == "__main__":
53
+ print("Fetching arXiv article URLs...")
54
+
55
+ while True:
56
+ # Build URL with pagination
57
+ url = f"{base_query}?{query_params}&start={start}"
58
+
59
+ # Parse the Atom feed
60
+ feed = feedparser.parse(url)
61
+ entries = feed.entries
62
+
63
+ if not entries:
64
+ empty_pages += 1
65
+ print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
66
+ if empty_pages >= max_empty_pages:
67
+ print("Stopping early due to repeated empty results.")
68
+ break
69
+ time.sleep(4)
70
+ start += page_size
71
+ continue
72
+
73
+ empty_pages = 0 # reset empty count on success
74
+
75
+ for entry in entries:
76
+ pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
77
+ if pub_date >= datetime(2018, 1, 1):
78
+ articles.append(entry.link)
79
+
80
+ # Log progress
81
+ print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")
82
+
83
+ # Stop if fewer than full page — probably the last one
84
+ if len(entries) < page_size:
85
+ print("Reached last page of results.")
86
+ break
87
+
88
+ start += page_size
89
+ time.sleep(4) # Respect rate limit
90
+ articles = np.unique(articles)
91
+ results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")
92
 
93
  # Filter out any None results due to errors
94
  results = [result for result in results if result is not None]
95
 
96
  # Convert the list of dictionaries to a DataFrame
97
  arxiv = pd.DataFrame(results)
98
+ arxiv.to_csv('data/raw/arxiv.csv')
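In this revision the per-paper link extraction is stubbed out (`urls = []  # fetch_url(pdf_url)`); the previous version of this script did it inline with pdfplumber. A minimal sketch of what the fetch_url helper from evaluations.url could look like, reconstructed from that older inline code rather than from a confirmed API:

```python
import os
import re
import requests
import pdfplumber

def fetch_url(pdf_url, tmp_path="tmp_article.pdf"):
    """Download a PDF and return the GitHub repository links found in its text (sketch)."""
    response = requests.get(pdf_url)
    if response.status_code != 200:
        return []
    with open(tmp_path, "wb") as f:
        f.write(response.content)

    link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
    urls = []
    with pdfplumber.open(tmp_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            urls.extend(re.findall(link_pattern, text))
    os.remove(tmp_path)

    # Keep only links that point at a specific repository, as the old script did.
    return [u for u in urls
            if "pytorch" not in u and "fchollet" not in u
            and len(u.split("github.com")[1].split("/")) >= 3]
```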
data_generation/paper_scraping/fetch_miccai.py ADDED
@@ -0,0 +1,89 @@
1
+ import pandas as pd
2
+ import requests
3
+ import re
4
+ from tqdm import tqdm
5
+ from tqdm.contrib.concurrent import process_map
6
+ from multiprocessing import cpu_count
7
+
8
+ # --- Parse a single paper page ---
9
+ def process_paper(year, url):
10
+ try:
11
+ paper_page = requests.get(url).text
12
+
13
+ # Title
14
+ title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
15
+ title = title_match.group(1).strip() if title_match else ""
16
+
17
+ # Code repo link
18
+ code_repo_match = re.search(r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL)
19
+ code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""
20
+
21
+ # Dataset info
22
+ dataset_match = re.search(r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL)
23
+ dataset_info = "Yes" if dataset_match else "No"
24
+
25
+ return {
26
+ "title": title,
27
+ "year": year,
28
+ "url": code_repo_link,
29
+ "public": dataset_info
30
+ }
31
+
32
+ except Exception as e:
33
+ print(f"Error processing {url}: {e}")
34
+ return None
35
+
36
+ # --- Main Execution ---
37
+ if __name__ == "__main__":
38
+ MICCAI_pages = [
39
+ "https://miccai2021.org/openaccess/paperlinks/",
40
+ "https://conferences.miccai.org/2022/papers/",
41
+ "https://conferences.miccai.org/2023/papers/",
42
+ "https://papers.miccai.org/miccai-2024/"
43
+ ]
44
+ MICCAI_root = [
45
+ "https://miccai2021.org/openaccess/paperlinks/",
46
+ "https://conferences.miccai.org",
47
+ "https://conferences.miccai.org",
48
+ "https://papers.miccai.org"
49
+ ]
50
+ years = [2021, 2022, 2023, 2024]
51
+
52
+ all_year_urls = []
53
+
54
+ print("🔍 Fetching paper URLs by year...")
55
+ for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
56
+ try:
57
+ response = requests.get(MICCAI_pages[i])
58
+ year_page = response.text
59
+ if years[i] == 2024:
60
+ matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
61
+ urls = [MICCAI_root[i] + match for match in matches]
62
+ else:
63
+ urls = [
64
+ MICCAI_root[i] + line.split('href="')[1].split('"')[0]
65
+ for line in year_page.split('\n')
66
+ if "&bullet" in line and 'href="' in line
67
+ ]
68
+ all_year_urls.extend([(years[i], url) for url in urls])
69
+ except Exception as e:
70
+ print(f"Failed to fetch year {years[i]}: {e}")
71
+
72
+ print(f"📄 Total papers found: {len(all_year_urls)}")
73
+
74
+ # --- Parallel scrape each paper page ---
75
+ print("⚙️ Processing paper metadata...")
76
+ results = process_map(
77
+ process_paper,
78
+ [item[0] for item in all_year_urls],
79
+ [item[1] for item in all_year_urls],
80
+ max_workers=12,
81
+ chunksize=1,
82
+ desc="Parsing Papers"
83
+ )
84
+
85
+ results = [r for r in results if r is not None]
86
+
87
+ miccai = pd.DataFrame(results)
88
+ miccai.to_csv('data/raw/miccai.csv', index=False)
89
+ print("✅ Saved to data/miccai.csv")
{data → data_generation/paper_scraping}/fetch_nature.py RENAMED
@@ -1,19 +1,23 @@
1
  import pandas as pd
2
  import requests
 
3
  import re
4
  from multiprocessing import Pool, cpu_count
5
- from functools import partial
 
 
 
6
 
7
  # Function to process each URL
8
  def process_nature_paper(article_link):
9
  try:
10
- url = f'https://www.nature.com/articles/{article_link}'
11
- article_text = requests.get(url).text
12
 
13
  pattern = r'Code availability.*?<a href="([^"]+)"'
14
  matches = re.findall(pattern, article_text, re.DOTALL)
15
  urls = [link for link in matches if "github" in link]
16
- url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
17
 
18
  year = re.findall(r'datetime="(\d{4})', article_text)[0]
19
  # # Find title
@@ -25,10 +29,10 @@ def process_nature_paper(article_link):
25
  dataset_info = "Yes" if (len(matches) > 0) else "No"
26
 
27
  # # Return a dictionary of the results
28
- return {"title": title, "url": url, "year": year, "public": dataset_info, "pdf": ""}
29
 
30
  except Exception as e:
31
- print(f"Error processing {url}: {e}")
32
  return None
33
 
34
  # Set debug mode
@@ -36,33 +40,29 @@ debug = False
36
 
37
  # Fetch all URLs for each year
38
  all_year_urls = []
39
- search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]
40
  articles = []
41
- for search_query in search_queries:
 
42
  page = 1
43
- while (page <= 100):
44
  url = f"{search_query}{page}"
45
  current_page = requests.get(url).text
46
  pattern = r'href="/articles/([^"]+)"'
47
  matches = re.findall(pattern, current_page)
48
- if (len(matches) == 0):
49
  break
50
  else:
51
  page += 1
52
-
53
- articles += matches
54
  articles = np.unique(articles)
55
 
56
 
57
  # Parallel processing using Pool
58
  if __name__ == "__main__":
59
- with Pool(processes=12) as pool:
60
- results = pool.starmap(process_nature_paper, [[article] for article in articles])
61
-
62
- # Filter out any None results due to errors
63
  results = [result for result in results if result is not None]
64
 
65
- # Convert the list of dictionaries to a DataFrame
66
  nature = pd.DataFrame(results)
67
  nature = nature[['title', 'year', 'pdf', 'url', 'public']]
68
- nature.to_csv('nature.csv')
 
1
  import pandas as pd
2
  import requests
3
+ import os
4
  import re
5
  from multiprocessing import Pool, cpu_count
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from tqdm.contrib.concurrent import process_map # better for multiprocessing
9
+
10
 
11
  # Function to process each URL
12
  def process_nature_paper(article_link):
13
  try:
14
+ pdf_url = f'https://www.nature.com/articles/{article_link}'
15
+ article_text = requests.get(pdf_url).text
16
 
17
  pattern = r'Code availability.*?<a href="([^"]+)"'
18
  matches = re.findall(pattern, article_text, re.DOTALL)
19
  urls = [link for link in matches if "github" in link]
20
+ # url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
21
 
22
  year = re.findall(r'datetime="(\d{4})', article_text)[0]
23
  # # Find title
 
29
  dataset_info = "Yes" if (len(matches) > 0) else "No"
30
 
31
  # # Return a dictionary of the results
32
+ return {"title": title, "year": year, "pdf": pdf_url + ".pdf", "url": urls, "public": dataset_info}
33
 
34
  except Exception as e:
35
+ print(f"Error processing {pdf_url}: {e}")
36
  return None
37
 
38
  # Set debug mode
 
40
 
41
  # Fetch all URLs for each year
42
  all_year_urls = []
43
+ search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page="]
44
  articles = []
45
+
46
+ for search_query in tqdm(search_queries, desc="Search Queries"):
47
  page = 1
48
+ while page <= 100:
49
  url = f"{search_query}{page}"
50
  current_page = requests.get(url).text
51
  pattern = r'href="/articles/([^"]+)"'
52
  matches = re.findall(pattern, current_page)
53
+ if not matches:
54
  break
55
  else:
56
  page += 1
57
+ articles += matches
 
58
  articles = np.unique(articles)
59
 
60
 
61
  # Parallel processing using Pool
62
  if __name__ == "__main__":
63
+ results = process_map(process_nature_paper, articles, max_workers=12, chunksize=1, desc="Processing Articles")
 
 
 
64
  results = [result for result in results if result is not None]
65
 
 
66
  nature = pd.DataFrame(results)
67
  nature = nature[['title', 'year', 'pdf', 'url', 'public']]
68
+ nature.to_csv('data/raw/nature.csv')
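The Code availability link is pulled from the raw article HTML with a DOTALL regex. A standalone check of that pattern against a fabricated snippet, for illustration only:

```python
import re

html = """
<h2>Code availability</h2>
<p>The code is available at <a href="https://github.com/someuser/somerepo">GitHub</a>.</p>
"""

pattern = r'Code availability.*?<a href="([^"]+)"'
matches = re.findall(pattern, html, re.DOTALL)
github_links = [link for link in matches if "github" in link]
print(github_links)   # ['https://github.com/someuser/somerepo']
```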
evaluations/documentation.py CHANGED
@@ -1,14 +1,15 @@
1
- from .utils import log,fetch_code
2
  import re
3
  import numpy as np
 
4
 
5
- def is_applicable(verbose, llm, readme):
6
  res_training = "NA"
7
  res_evaluation = "NA"
8
  res_weights = "NA"
9
 
10
  if (llm):
11
- log(verbose, "TITLE", "\nChecking what parts of the evaluations are applicable...")
12
  res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
13
  res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
14
  res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
@@ -16,31 +17,28 @@ def is_applicable(verbose, llm, readme):
16
  applicable = f"{res_training}/{res_evaluation}/{res_weights}"
17
  return applicable
18
 
19
- def evaluate(verbose, llm, zip, readme):
20
- log(verbose, "TITLE", "\nEvaluating code documentation...")
21
  overall = "No"
22
 
23
-
24
  code_to_comment_ratio = get_code_to_comment_ratio(zip)
25
- log(verbose, "LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
26
-
27
-
28
- if (readme):
29
  non_empty_rows = [row for row in readme.split("\n") if row != ""]
30
  if (len(non_empty_rows) < 5):
31
- log(verbose, "ERROR", "Readme file has very few lines")
32
 
33
  if (llm):
34
  code = fetch_code(zip)
35
  if (llm):
36
  summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
37
- log(verbose, "LOG", f"Based on the code, your readme file could be something like...\n{summary}")
38
  return overall
39
 
40
- if (count_code_lines(non_empty_rows) > 5):
41
- log(verbose, "LOG", "Readme file contains python examples.")
42
- overall = "Yes"
43
- return overall
44
 
45
 
46
  if (llm):
@@ -50,26 +48,30 @@ def evaluate(verbose, llm, zip, readme):
50
  and evaluate the proposed model?'
51
  llm.predict("HELP", prompt)
52
 
 
 
 
 
 
 
53
 
54
- manual_fail = False
55
  if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
56
- log(verbose, "ERROR", "Readme file missing training information")
57
- overall = "No"
58
- if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
59
- log(verbose, "ERROR", "Readme file missing testing information")
60
- overall = "No"
61
 
62
- if ((len(re.findall("example", readme, re.IGNORECASE)) > 0)):
63
- log(verbose, "LOG", "Readme file contains links to examples")
64
- overall = "Yes"
 
65
 
66
- if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
67
- (len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
68
- (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
69
- log(verbose, "ERROR", "Readme file missing information about package dependencies")
70
- overall = "No"
71
 
72
- return overall
 
73
 
74
  def count_comment_lines(lines):
75
  # Initialize counters
 
1
+ from .utils import fetch_code
2
  import re
3
  import numpy as np
4
+ from core.conversion import noop_logger
5
 
6
+ def is_applicable(llm, readme, log_fn=noop_logger):
7
  res_training = "NA"
8
  res_evaluation = "NA"
9
  res_weights = "NA"
10
 
11
  if (llm):
12
+ log_fn("TITLE", "\nChecking what parts of the evaluations are applicable...")
13
  res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
14
  res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
15
  res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
 
17
  applicable = f"{res_training}/{res_evaluation}/{res_weights}"
18
  return applicable
19
 
20
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
21
+ log_fn("TITLE", "\nEvaluating code documentation...")
22
  overall = "No"
23
 
 
24
  code_to_comment_ratio = get_code_to_comment_ratio(zip)
25
+ log_fn("LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
26
+ result = { "dependencies": "No", "training": "No", "evaluation": "No", "weights": "No", "scripts": "No" }
27
+ for readme in readmes:
 
28
  non_empty_rows = [row for row in readme.split("\n") if row != ""]
29
  if (len(non_empty_rows) < 5):
30
+ log_fn("ERROR", "Readme file has very few lines")
31
 
32
  if (llm):
33
  code = fetch_code(zip)
34
  if (llm):
35
  summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
36
+ log_fn("LOG", f"Based on the code, your readme file could be something like...\n{summary}")
37
  return overall
38
 
39
+ if (count_code_lines(non_empty_rows) > 2):
40
+ log_fn("LOG", "Readme file contains python examples.")
41
+ result["scripts"] = "Yes"
 
42
 
43
 
44
  if (llm):
 
48
  and evaluate the proposed model?'
49
  llm.predict("HELP", prompt)
50
 
51
+ if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
52
+ (len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
53
+ (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
54
+ log_fn("ERROR", "Readme file missing information about package dependencies")
55
+ else:
56
+ result["dependencies"] = "Yes"
57
 
 
58
  if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
59
+ log_fn("ERROR", "Readme file missing training information")
60
+ else:
61
+ result["training"] = "Yes"
 
 
62
 
63
+ if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
64
+ log_fn("ERROR", "Readme file missing testing information")
65
+ else:
66
+ result["evaluating"] = "Yes"
67
 
68
+ if ((len(re.findall("example", readme, re.IGNORECASE)) == 0)):
69
+ log_fn("LOG", "Readme file contains no links to examples")
70
+ else:
71
+ result["evaluating"] = "Yes"
 
72
 
73
+ score = np.sum(np.array(list(result.values()), dtype=str) == "Yes")
74
+ return "Yes" if score >= 2 else "No"
75
 
76
  def count_comment_lines(lines):
77
  # Initialize counters
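The rewritten evaluate() collects per-README findings in a result dict and answers "Yes" once at least two checks pass. A standalone sketch of just that scoring step:

```python
import numpy as np

# Example outcome of the README checks (keys mirror the result dict in evaluate()).
result = {"dependencies": "Yes", "training": "No", "evaluation": "Yes",
          "weights": "No", "scripts": "No"}

score = np.sum(np.array(list(result.values()), dtype=str) == "Yes")
overall = "Yes" if score >= 2 else "No"
print(score, overall)   # 2 Yes
```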
evaluations/license.py CHANGED
@@ -1,10 +1,10 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nEvaluating repository licensing...")
6
  overall = "No"
7
- license_files = [license for license in zip.namelist() if ((("LICENSE" in license) | ("license" in license)) & (len(license.split("/")) == 2))]
8
  if (len(license_files) > 0):
9
  license = zip.open(license_files[0]).read().decode("utf-8")
10
  ans = [row for row in license.split("\n") if row != ""]
@@ -13,18 +13,18 @@ def evaluate(verbose, llm, zip, readme):
13
  license = license
14
  prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
15
  ans = llm.predict("HELP", prompt)
16
- log(verbose, "LOG", f"Found license: {ans}")
17
  else:
18
- log(verbose, "LOG", f"Found license file: {license_files[0]}")
19
 
20
  overall = "Yes"
21
  return overall
22
 
23
- if (readme):
24
- if ("License" in readme):
25
- log(verbose, "LOG", "License found in README.")
26
- overall = "Yes"
27
- return overall
28
 
29
- log(verbose, "ERROR", "LICENSE file not found.")
30
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readme, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nEvaluating repository licensing...")
6
  overall = "No"
7
+ license_files = [license_path for license_path in zip.namelist() if ((("license" in license_path.lower())) & (len(license_path.split("/")) == 2))]
8
  if (len(license_files) > 0):
9
  license = zip.open(license_files[0]).read().decode("utf-8")
10
  ans = [row for row in license.split("\n") if row != ""]
 
13
  license = license
14
  prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
15
  ans = llm.predict("HELP", prompt)
16
+ log_fn("LOG", f"Found license: {ans}")
17
  else:
18
+ log_fn("LOG", f"Found license file: {license_files[0]}")
19
 
20
  overall = "Yes"
21
  return overall
22
 
23
+ for readme_file in readme:
24
+ if ("license" in readme_file.lower()):
25
+ log_fn("LOG", "License found in README.")
26
+ overall = "Yes"
27
+ return overall
28
 
29
+ log_fn("ERROR", "LICENSE file not found.")
30
  return overall
evaluations/pitfalls.py CHANGED
@@ -1,13 +1,14 @@
1
- from .utils import log, fetch_code
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for common pitfalls (in development)...")
6
  codebase = fetch_code(zip)
7
 
8
  if (llm):
9
  for code in codebase:
10
  pitfall_check = llm.predict("STRICT", f"{codebase[code]}Do you find any signs of serious issues in this code?")
11
  if (("Yes" in pitfall_check) & ("No" not in pitfall_check)):
12
- log(verbose, "ERROR", f"Found possible issues in {code}")
13
- log(verbose, "LOG", llm.predict("PITFALL", f"File name {code} file {codebase[code]}\n Can you find any signs of common pitfalls in this code?"))
 
1
+ from .utils import fetch_code
2
  import re
3
+ from core.conversion import noop_logger
4
 
5
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
6
+ log_fn("TITLE", "\nLooking for common pitfalls (in development)...")
7
  codebase = fetch_code(zip)
8
 
9
  if (llm):
10
  for code in codebase:
11
  pitfall_check = llm.predict("STRICT", f"{codebase[code]}Do you find any signs of serious issues in this code?")
12
  if (("Yes" in pitfall_check) & ("No" not in pitfall_check)):
13
+ log_fn("ERROR", f"Found possible issues in {code}")
14
+ log_fn("LOG", llm.predict("PITFALL", f"File name {code} file {codebase[code]}\n Can you find any signs of common pitfalls in this code?"))
evaluations/repo_evaluations.py CHANGED
@@ -2,84 +2,71 @@ import pandas as pd
2
  import os
3
  from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
4
  from evaluations.utils import *
 
 
5
  import zipfile
 
 
6
  import os
7
  import numpy as np
8
  from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
9
 
10
- def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
11
  try:
12
  if (not(llm)):
13
- log(verbose, "LOG", "No LLM will be used for the evaluation.")
14
 
15
- results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
16
 
17
- if ((title != None) & (year != None) & (title != "") & (year != "")):
18
- res = fetch_openalex(verbose, title, year)
19
- if ((res != None)):
20
- res = res["results"]
21
- if (len(res) > 0):
22
- res = res[0]
23
- results["pred_citations"] = res["cited_by_count"]
24
-
25
- if (get_api_link(repo_url) != ""):
26
- results["pred_valid"] = True
27
- else:
28
- return results
29
-
30
- username, repo_name = decompose_url(repo_url)
31
-
32
- # If you don't provide a zip file, it will be fetched from github. For this, you need to provide a github token.
33
- if (zip is None):
34
- token = os.getenv("githubToken")
35
- repository_zip_name = "data/repo.zip"
36
- log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
37
-
38
- fetch_repo(verbose, repo_url, repository_zip_name, token)
39
 
40
- if (not(os.path.exists(repository_zip_name))):
41
- results["pred_live"] = "No"
42
- return results
43
-
44
- results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
45
 
46
- zip = zipfile.ZipFile(repository_zip_name)
 
 
47
 
48
- readme = fetch_readme(zip)
49
- results["NA"] = documentation.is_applicable(verbose, llm, readme)
50
-
51
- results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
52
 
53
  if (len(zip.namelist()) <= 2):
54
- log(verbose, "LOG", "The repository is empty.")
55
 
56
- results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
57
- results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
58
- results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
59
- results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
60
- results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
61
- results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
62
- pitfalls.evaluate(verbose, llm, zip, readme)
63
-
64
- return results
65
  except Exception as e:
66
- log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
67
- results["pred_live"] = "No"
68
- return results
69
-
70
- def full_evaluation():
71
- paper_dump = pd.read_csv("data/zipfiles.csv", sep="\t")
72
- full_results = []
73
-
74
- for idx, row in paper_dump.iterrows():
75
-
76
- if (pd.isna(row["url"]) | (row["url"] == "")):
77
- continue
78
-
79
- print(str(int(100 * idx / paper_dump["title"].count())) + "% done")
80
- result = evaluate(None, False, row["url"], row["title"], row["year"], zip=zipfile.ZipFile(row["zip_idx"]))
81
- for column in result.keys():
82
- row[column] = result[column]
83
-
84
- full_results.append(row)
85
- return pd.DataFrame(full_results)
 
2
  import os
3
  from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
4
  from evaluations.utils import *
5
+ from core.conversion import fetch_repo, decompose_url
6
+
7
  import zipfile
8
+ import csv
9
+
10
  import os
11
  import numpy as np
12
  from huggingface_hub import InferenceClient
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ from core.conversion import noop_logger
15
+
16
+ token = os.getenv("githubToken")
17
+ def evaluate(llm, paper, log_fn=noop_logger):
18
+ repo_url = paper.main_repo_url
19
+ title = paper.title
20
+ year = paper.year
21
+ zip=zipfile.ZipFile(paper.zip_path)
22
 
 
23
  try:
24
  if (not(llm)):
25
+ log_fn("LOG", "No LLM will be used for the evaluation.")
26
 
27
+ paper.code_repro_auto = { "live": "Yes", "dependencies": None, "training": None, "evaluation": None, "weights": None, "readme": None, "license": None, "stars": None, "citations": None, "valid": False}
28
 
29
+ # if ((title != None) & (year != None) & (title != "") & (year != "")):
30
+ # res = fetch_openalex(title, year, log_fn=log_fn)
31
+ # if ((res != None)):
32
+ # res = res["results"]
33
+ # if (len(res) > 0):
34
+ # res = res[0]
35
+ # paper.code_repro_auto["citations"] = res["cited_by_count"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ # if (get_api_link(repo_url) != ""):
38
+ # paper.code_repro_auto["valid"] = True
39
+ # else:
40
+ # return paper.code_repro_auto
 
41
 
42
+ # paper.code_repro_auto["stars"] = fetch_repo_stars(repo_url, token, log_fn)
43
+ readmes = fetch_readmes(zip)
44
+ paper.code_repro_auto["NA"] = documentation.is_applicable(llm, readmes, log_fn)
45
 
46
+ paper.code_repro_auto["license"] = license.evaluate(llm, zip, readmes, log_fn)
 
 
 
47
 
48
  if (len(zip.namelist()) <= 2):
49
+ log_fn("LOG", "The repository is empty.")
50
 
51
+ paper.code_repro_auto["dependencies"] = requirements.evaluate(llm, zip, readmes, log_fn)
52
+ paper.code_repro_auto["training"] = training.evaluate(llm, zip, readmes, log_fn)
53
+ paper.code_repro_auto["evaluation"] = validating.evaluate(llm, zip, readmes, log_fn)
54
+ paper.code_repro_auto["weights"] = weights.evaluate(llm, zip, readmes, log_fn)
55
+ paper.code_repro_auto["readme"] = documentation.evaluate(llm, zip, readmes, log_fn)
56
+ paper.code_repro_auto["codetocomment"] = documentation.get_code_to_comment_ratio(zip)
57
+ pitfalls.evaluate(llm, zip, readmes, log_fn)
58
+
59
+ return paper
60
  except Exception as e:
61
+ log_fn("ERROR", "Evaluating repository failed: " + str(e))
62
+ paper.code_repro_auto["live"] = "No"
63
+ return paper
64
+
65
+ def process_row(paper):
66
+ if ((paper.zip_path is None) or (not(os.path.exists(paper.zip_path)))):
67
+ paper.log("ERROR", "Zip file doesn't exist")
68
+ return paper
69
+
70
+ paper = evaluate(None, paper, paper.log)
71
+
72
+ return paper
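evaluate() now receives a Paper, writes its findings into paper.code_repro_auto, and process_row() guards against missing zip files. A hedged sketch of running the checks over a batch of papers loaded from the dump produced by data_generation/fetch_processed.py:

```python
import pandas as pd
from core.paper import Paper
from evaluations.repo_evaluations import process_row

df = pd.read_csv("data/papers.csv", sep="\t")
papers = [process_row(Paper.from_row(row)) for _, row in df.iterrows()]

live = [p for p in papers if p.code_repro_auto.get("live") == "Yes"]
print(f"{len(live)}/{len(papers)} repositories evaluated successfully")
```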
 
 
 
 
 
 
 
 
evaluations/requirements.py CHANGED
@@ -1,24 +1,26 @@
1
- from .utils import log
2
 
3
- def evaluate(verbose, llm, zip, readme):
4
- log(verbose, "TITLE", "\nLooking for package dependencies for running the code...")
5
  overall = "No"
6
 
7
  scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
8
 
9
  files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
10
- files = [file_path for file_path in files if len(file_path.split("/")) == 2]
11
  for file in files:
12
- log(verbose, "LOG", f"Found requirements file: {file}")
13
  requirements = zip.open(file).read().decode("utf-8")
14
- overall = "Yes"
15
  if (len(requirements.split("\n")) < 5):
16
- log(verbose, "ERROR", "Requirements file contains too few lines.")
17
- overall = "No"
18
-
19
- if (readme):
20
- if (("requirement" in readme) | ("Requirement" in readme) | ("Dependenc" in readme) | ("dependenc" in readme) | (len([row for row in readme.split("\n") if (("#" in row) & (("environment" in row) | ("Environment" in row)))]) > 0)):
21
- log(verbose, "LOG", "Found dependencies in README file")
22
- overall = "Yes"
 
 
23
 
24
  return overall
 
1
+ from core.conversion import noop_logger
2
 
3
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
4
+ log_fn("TITLE", "\nLooking for package dependencies for running the code...")
5
  overall = "No"
6
 
7
  scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
8
 
9
  files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
10
+ # files = [file_path for file_path in files if len(file_path.split("/")) == 2]
11
  for file in files:
12
+ log_fn("LOG", f"Found requirements file: {file}")
13
  requirements = zip.open(file).read().decode("utf-8")
14
+
15
  if (len(requirements.split("\n")) < 5):
16
+ log_fn("ERROR", "Requirements file contains too few lines.")
17
+ continue
18
+ overall = "Yes"
19
+
20
+ for readme in readmes:
21
+ if (readme):
22
+ if (("requirement" in readme) | ("Requirement" in readme) | ("Dependenc" in readme) | ("dependenc" in readme) | (len([row for row in readme.split("\n") if (("#" in row) & (("environment" in row) | ("Environment" in row)))]) > 0)):
23
+ log_fn("LOG", "Found dependencies in README file")
24
+ overall = "Yes"
25
 
26
  return overall
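
With the root-level filter commented out, every requirements-style file in the archive is now inspected, and a file with fewer than five lines is logged and skipped rather than forcing the overall answer back to "No". A minimal sketch of exercising the new signature against a downloaded zipball (the path is hypothetical):

import zipfile
from evaluations import requirements
from evaluations.utils import fetch_readmes

zip_file = zipfile.ZipFile("data/test/example_repo.zip")  # hypothetical local zipball
readmes = fetch_readmes(zip_file)

log_lines = []
answer = requirements.evaluate(None, zip_file, readmes, lambda t, m: log_lines.append((t, m)))
print(answer)  # "Yes" or "No"
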
evaluations/training.py CHANGED
@@ -1,8 +1,8 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for code to train the model...")
6
  overall = "No"
7
 
8
 
@@ -23,15 +23,15 @@ def evaluate(verbose, llm, zip, readme):
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
- log(verbose, "LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
-
30
- if (readme):
31
- if (("train" in readme)):
32
- log(verbose, "LOG", "Found something about training in README file")
33
- overall = "Yes"
34
 
35
  if (overall == "No"):
36
- log(verbose, "ERROR", "Found no code for training the model.")
37
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for code to train the model...")
6
  overall = "No"
7
 
8
 
 
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
+ log_fn("LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
+ for readme in readmes:
30
+ if (readme):
31
+ if (("train" in readme)):
32
+ log_fn("LOG", "Found something about training in README file")
33
+ overall = "Yes"
34
 
35
  if (overall == "No"):
36
+ log_fn("ERROR", "Found no code for training the model.")
37
  return overall
evaluations/url.py ADDED
@@ -0,0 +1,86 @@
 
1
+ import pandas as pd
2
+ import os
3
+ import zipfile
4
+ import os
5
+ import re
6
+ from uuid import uuid4
7
+ import pdfplumber
8
+ import numpy as np
9
+ from urllib.request import urlretrieve
10
+ import xml.etree.ElementTree as ET
11
+ import re
12
+ import fitz # PyMuPDF
13
+
14
+ def get_fitz_urls(pdf_path):
15
+ doc = fitz.open(pdf_path)
16
+ urls = []
17
+
18
+ for page in doc:
19
+ for link in page.get_links():
20
+ if 'uri' in link:
21
+ urls.append(link['uri'])
22
+
23
+ return urls
24
+
25
+
26
+ NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}
27
+
28
+ def find_pattern_in_xml(root, pattern):
29
+ """
30
+ Recursively search for a regex pattern in all text fields of an XML tree.
31
+
32
+ :param root: The root Element of the XML tree
33
+ :param pattern: The regex pattern to search for
34
+ :return: A list of matching strings
35
+ """
36
+ matches = []
37
+ regex = re.compile(pattern)
38
+
39
+ # Check element text
40
+ if root.text:
41
+ matches.extend(regex.findall(root.text))
42
+
43
+ # Check element attributes
44
+ for attr_value in root.attrib.values():
45
+ matches.extend(regex.findall(attr_value))
46
+
47
+ # Recursively search in children
48
+ for child in root:
49
+ matches.extend(find_pattern_in_xml(child, pattern))
50
+
51
+ return matches
52
+
53
+ def fetch_url(pdf_path):
54
+ if (pdf_path is None):
55
+ raise ValueError("Pdf has no path")
56
+
57
+ urls = []
58
+ link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
59
+
60
+ # if (method == "plumber"):
61
+ full_text = ""
62
+ with pdfplumber.open(pdf_path) as pdf:
63
+ for page in pdf.pages:
64
+ # Extract text from the page and normalize spaces
65
+ text = page.extract_text()
66
+ if text:
67
+ full_text += text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ") + " "
68
+
69
+ # Find all URLs in the combined text
70
+ found_urls = re.findall(link_pattern, full_text)
71
+ urls.extend(found_urls)
72
+ # elif (method == "grobid"):
73
+ # paper = pdf_to_grobid(file_name)
74
+ # found_urls = find_pattern_in_xml(paper, link_pattern)
75
+ # urls.extend(found_urls)
76
+ # os.remove(file_name)
77
+ # elif (method == "fitz")
78
+ fitz_urls = get_fitz_urls(pdf_path)
79
+ urls.extend(fitz_urls)
80
+ urls = np.unique(urls)
81
+ urls = [s for s in urls if "/" in s]
82
+ urls = [s for s in urls if "git" in s]
83
+ # else:
84
+ # raise Exception("Method unknown")
85
+ return urls
86
+
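
fetch_url merges two passes — a broad URL regex over the pdfplumber text (with line-break artifacts stitched back together) and the embedded link annotations PyMuPDF exposes — then keeps only candidates containing a slash and "git". A minimal usage sketch (the PDF path is hypothetical):

from evaluations.url import fetch_url

candidate_urls = fetch_url("data/test/paper.pdf")  # hypothetical downloaded paper
for url in candidate_urls:
    print(url)  # e.g. github.com/<user>/<repo> links found in the text or annotations
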
evaluations/utils.py CHANGED
@@ -4,7 +4,8 @@ import time
4
  import os
5
  import zipfile
6
  import json
7
- import streamlit as st
 
8
 
9
  def fetch_code(zip_file):
10
  zip_content_dict = {}
@@ -14,27 +15,8 @@ def fetch_code(zip_file):
14
  zip_content_dict[file_name] = file_content
15
  return zip_content_dict
16
 
17
- def get_api_link(url):
18
- username, repo_name = decompose_url(url)
19
- if (username == None):
20
- return ""
21
- return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
22
 
23
- def decompose_url(url):
24
- try:
25
- url = url.split("github.com")[1]
26
- url = url.strip(".")
27
- url = url.split(".git")[0]
28
- url = url.strip("/")
29
- parts = url.split("/")
30
- username = parts[0]
31
- repo_name = parts[1]
32
- return username, repo_name
33
- except:
34
- return None, None
35
-
36
-
37
- def fetch_repo_stars(verbose, repo_url, token):
38
  headers = {"Authorization": f"token {token}"}
39
  api_url = get_api_link(repo_url)
40
  api_url = api_url.replace("/zipball/", "")
@@ -45,38 +27,14 @@ def fetch_repo_stars(verbose, repo_url, token):
45
  if response.status_code == 200:
46
  return json.loads(response.content)["stargazers_count"]
47
  if (response.status_code == 404):
48
- log(verbose, "ERROR", "Repository private.")
49
-
50
- def fetch_repo(verbose, repo_url, repo_name, token):
51
- if (os.path.exists(repo_name)):
52
- os.remove(repo_name)
53
-
54
- if ("github.com" not in repo_url):
55
- log(verbose, "ERROR", f"URL not for github repo, please evaluate manually ({repo_url}).")
56
- return
57
-
58
- headers = {"Authorization": f"token {token}"}
59
- api_url = get_api_link(repo_url)
60
-
61
- if (api_url == ""):
62
- log(verbose, "ERROR", f"Failed to parse the URL, please evaluate manually ({repo_url}).")
63
- return
64
 
65
- # Sending GET request to GitHub API
66
- response = requests.get(api_url, headers=headers)
67
-
68
- if response.status_code == 200:
69
- with open(repo_name, 'wb') as file:
70
- file.write(response.content)
71
- if (response.status_code == 404):
72
- log(verbose, "ERROR", "Repository private / Link broken.")
73
-
74
- def fetch_readme(zip):
75
- readme_files = [readme for readme in zip.namelist() if ((readme.endswith("README.MD") | readme.endswith("README.md") | readme.endswith("readme.md")) & (len(readme.split("/")) == 2))]
76
- readme = ""
77
  for readme_file in readme_files:
78
- readme += zip.open(readme_file).read().decode("utf-8") + "\n\n"
79
- return readme
80
 
81
  def fetch_license(zip):
82
  license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
@@ -85,7 +43,7 @@ def fetch_license(zip):
85
  license = zip.open(license_files[0]).read().decode("utf-8")
86
  return license
87
 
88
- def fetch_openalex(verbose, paper_name, year):
89
  api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
90
 
91
  response = requests.get(api_url)
@@ -93,36 +51,7 @@ def fetch_openalex(verbose, paper_name, year):
93
  if response.status_code == 200:
94
  return response.json()
95
  else:
96
- log(verbose, "WARNING", "Could not find OpenAlex information for paper.")
97
-
98
-
99
- def log(verbose, log_type, log_text, hf=False):
100
- if (verbose == 0):
101
- return
102
-
103
- show_tips = (verbose == 2) | (verbose == 4)
104
- if ((verbose == 1) | (verbose == 2)):
105
- show = print
106
- if ((verbose == 3) | (verbose == 4)):
107
- show = st.write
108
-
109
- # Align line-break
110
- if (log_text.startswith("\n")):
111
- show("\n")
112
- log_text = log_text.lstrip('\n')
113
-
114
- # Only show tips in verbose mode 2 and 4
115
- if ((log_type == "TITLE") & show_tips):
116
- show(f"\n#### {log_text}")
117
- if ((log_type == "TIP") & show_tips):
118
- show(f"*{log_text}*")
119
- if ((log_type == "LOG") & show_tips):
120
- show(f"{log_text}")
121
- if ((log_type == "ERROR")):
122
- show(f"**{log_text}**")
123
-
124
- if ((log_type != "TIP") & (log_type != "LOG") & (log_type != "ERROR") & (log_type != "TITLE")):
125
- raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")
126
 
127
- def init_llm(verbose):
128
- log(verbose, "LOG", "Initializing LLM...")
 
4
  import os
5
  import zipfile
6
  import json
7
+ from core.conversion import get_api_link
8
+ from core.conversion import noop_logger
9
 
10
  def fetch_code(zip_file):
11
  zip_content_dict = {}
 
15
  zip_content_dict[file_name] = file_content
16
  return zip_content_dict
17
 
 
 
 
 
 
18
 
19
+ def fetch_repo_stars(repo_url, token, log_fn=noop_logger):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  headers = {"Authorization": f"token {token}"}
21
  api_url = get_api_link(repo_url)
22
  api_url = api_url.replace("/zipball/", "")
 
27
  if response.status_code == 200:
28
  return json.loads(response.content)["stargazers_count"]
29
  if (response.status_code == 404):
30
+ log_fn("ERROR", "Repository private.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def fetch_readmes(zip):
33
+ readme_files = [readme for readme in zip.namelist() if (readme.lower().endswith("readme.md") & (len(readme.split("/")) == 2))]
34
+ readmes = []
 
 
 
 
 
 
 
 
 
35
  for readme_file in readme_files:
36
+ readmes.append(zip.open(readme_file).read().decode("utf-8"))
37
+ return readmes
38
 
39
  def fetch_license(zip):
40
  license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
 
43
  license = zip.open(license_files[0]).read().decode("utf-8")
44
  return license
45
 
46
+ def fetch_openalex(paper_name, year, log_fn=noop_logger):
47
  api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
48
 
49
  response = requests.get(api_url)
 
51
  if response.status_code == 200:
52
  return response.json()
53
  else:
54
+ log_fn("WARNING", "Could not find OpenAlex information for paper.")
 
55
 
56
+ def init_llm(log_fn=noop_logger):
57
+ log_fn("LOG", "Initializing LLM...")
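
fetch_readmes now returns a list of README contents instead of one concatenated string, and the len(readme.split("/")) == 2 check keeps only files at the top level of the zipball, whose single root folder contributes one path segment. A short illustration of which entries pass the filter, assuming the usual <repo>-<sha>/ layout of a GitHub zipball:

entries = [
    "repo-abc123/README.md",       # kept: two path segments and ends with readme.md
    "repo-abc123/docs/README.md",  # skipped: nested three segments deep
    "repo-abc123/README.rst",      # skipped: only Markdown READMEs are matched
]
for name in entries:
    kept = name.lower().endswith("readme.md") and len(name.split("/")) == 2
    print(name, kept)
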
evaluations/validating.py CHANGED
@@ -1,8 +1,8 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for examples for running the model...")
6
  overall = "No"
7
  patterns = {
8
  'tensorflow': [
@@ -23,14 +23,15 @@ def evaluate(verbose, llm, zip, readme):
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
- log(verbose, "LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
- if (readme):
30
- if ((len(re.findall("testing", readme)) > 0)):
31
- log(verbose, "LOG", "Found information about evaluations in readme")
32
- overall = "Yes"
 
33
 
34
  if (overall == "No"):
35
- log(verbose, "ERROR", "Found no code for evaluating the model.")
36
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for examples for running the model...")
6
  overall = "No"
7
  patterns = {
8
  'tensorflow': [
 
23
  for framework, regex_list in patterns.items():
24
  for pattern in regex_list:
25
  if re.search(pattern, code):
26
+ log_fn("LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
27
  overall = "Yes"
28
 
29
+ for readme in readmes:
30
+ if (readme):
31
+ if ((len(re.findall("testing", readme)) > 0)):
32
+ log_fn("LOG", "Found information about evaluations in readme")
33
+ overall = "Yes"
34
 
35
  if (overall == "No"):
36
+ log_fn("ERROR", "Found no code for evaluating the model.")
37
  return overall
evaluations/weights.py CHANGED
@@ -1,52 +1,53 @@
1
- from .utils import log
2
  import re
 
3
 
4
- def evaluate(verbose, llm, zip, readme):
5
- log(verbose, "TITLE", "\nLooking for pre-trained model weights...")
6
  overall = "No"
7
  files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
8
  if (len(files) > 0):
9
- log(verbose, "LOG", f"Found model weights: {files}")
10
  overall = "Yes"
11
  return overall
12
 
13
- if (readme):
14
-
15
- url_pattern = r'(https?://[^\s]+)'
16
- urls = re.findall(url_pattern, readme)
17
- if (len([url for url in urls if "pth" in url]) > 0):
18
- log(verbose, "LOG", "Found a link to pre-trained weights in readme")
19
- overall = "Yes"
20
- return overall
21
-
22
- readme_lines = readme.split("\n")
23
- if (len([row for row in readme_lines if ((len(re.findall("pretrained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
24
- log(verbose, "LOG", "Found a link for 'pretrained' something in readme")
25
- overall = "Yes"
26
- return overall
27
-
28
- if (len([row for row in readme_lines if ((len(re.findall("pre-trained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
29
- log(verbose, "LOG", "Found a link for 'pre-trained' something in readme")
30
- overall = "Yes"
31
- return overall
32
-
33
- if (len([row for row in readme_lines if ((len(re.findall("weight", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
34
- log(verbose, "LOG", "Found a link for 'weight' something in readme")
35
- overall = "Yes"
36
- return overall
37
-
38
- if (len([row for row in readme_lines if ((len(re.findall("download", row, re.IGNORECASE)) > 0) & (len(re.findall("model", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
39
- log(verbose, "LOG", "Found a link for 'model' something in readme")
40
- overall = "Yes"
41
- return overall
42
-
43
- if (llm):
44
- prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
45
- ans = llm.predict("STRICT", prompt)
46
- if (("Yes" in ans) & ("No" not in ans)):
47
- log(verbose, "LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
48
- overall = "Yes"
49
- return overall
50
-
51
- log(verbose, "ERROR", "Found no pre-trained model weights.")
 
52
  return overall
 
 
1
  import re
2
+ from core.conversion import noop_logger
3
 
4
+ def evaluate(llm, zip, readmes, log_fn=noop_logger):
5
+ log_fn("TITLE", "\nLooking for pre-trained model weights...")
6
  overall = "No"
7
  files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
8
  if (len(files) > 0):
9
+ log_fn("LOG", f"Found model weights: {files}")
10
  overall = "Yes"
11
  return overall
12
 
13
+ for readme in readmes:
14
+ if (readme):
15
+
16
+ url_pattern = r'(https?://[^\s]+)'
17
+ urls = re.findall(url_pattern, readme)
18
+ if (len([url for url in urls if "pth" in url]) > 0):
19
+ log_fn("LOG", "Found a link to pre-trained weights in readme")
20
+ overall = "Yes"
21
+ return overall
22
+
23
+ readme_lines = readme.split("\n")
24
+ if (len([row for row in readme_lines if ((len(re.findall("pretrained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
25
+ log_fn("LOG", "Found a link for 'pretrained' something in readme")
26
+ overall = "Yes"
27
+ return overall
28
+
29
+ if (len([row for row in readme_lines if ((len(re.findall("pre-trained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
30
+ log_fn("LOG", "Found a link for 'pre-trained' something in readme")
31
+ overall = "Yes"
32
+ return overall
33
+
34
+ if (len([row for row in readme_lines if ((len(re.findall("weight", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
35
+ log_fn("LOG", "Found a link for 'weight' something in readme")
36
+ overall = "Yes"
37
+ return overall
38
+
39
+ if (len([row for row in readme_lines if ((len(re.findall("download", row, re.IGNORECASE)) > 0) & (len(re.findall("model", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
40
+ log_fn("LOG", "Found a link for 'model' something in readme")
41
+ overall = "Yes"
42
+ return overall
43
+
44
+ if (llm):
45
+ prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
46
+ ans = llm.predict("STRICT", prompt)
47
+ if (("Yes" in ans) & ("No" not in ans)):
48
+ log_fn("LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
49
+ overall = "Yes"
50
+ return overall
51
+
52
+ log_fn("ERROR", "Found no pre-trained model weights.")
53
  return overall
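
The final fallback asks the LLM whether the README links pre-trained weights via llm.predict("STRICT", prompt). The real LLM wrapper is not part of this hunk, so the stub below only illustrates the predict(mode, prompt) shape these call sites assume; it is not the actual class.

class FakeLLM:
    """Hypothetical stand-in matching the predict(mode, prompt) call sites above."""
    def predict(self, mode, prompt):
        # mode is a decoding preset name such as "STRICT"; a real wrapper queries a hosted model.
        return "Yes" if "download" in prompt.lower() else "No"

# weights.evaluate(FakeLLM(), zip_file, readmes, log_fn) would then exercise the LLM branch.
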
full_eval.py CHANGED
@@ -1,4 +1,19 @@
1
- from evaluations.repo_evaluations import full_evaluation
 
 
 
 
 
2
 
3
- res = full_evaluation()
4
- res.to_csv("data/results.csv", sep="\t", index=False)
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from evaluations.repo_evaluations import process_row
3
+ from evaluations.utils import *
4
+ from concurrent.futures import ProcessPoolExecutor
5
+ from tqdm import tqdm
6
+ from core.paper import Paper
7
 
8
+ if __name__ == "__main__":
9
+ paper_dump = pd.read_csv("data/urls.csv", sep="\t")
10
+
11
+ max_workers = 12
12
+ papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]
13
+ # papers = [paper for paper in papers if paper.main_repo_url == "https://github.com/AsukaDaisuki/MAT"]
14
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
15
+ papers = list(tqdm(executor.map(process_row, papers), total=len(papers), desc="Running repo evaluations"))
16
+
17
+ results = [p.to_dict() for p in papers]
18
+ results_df = pd.DataFrame(results)
19
+ results_df.to_csv("data/results.csv", sep="\t", index=False)
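
full_eval.py builds Paper objects from data/urls.csv, fans the evaluations out over a ProcessPoolExecutor, and serialises each result with to_dict(). core/paper.py itself is not shown in this diff, so the stand-in below only records the interface this script and process_row rely on; any attribute not visible in the diff is an assumption.

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class PaperStub:
    # Fields touched by full_eval.py / process_row(); the real class likely has more.
    title: str
    year: int
    venue: str
    main_repo_url: Optional[str]
    zip_path: Optional[str]
    code_repro_auto: dict = field(default_factory=dict)

    def log(self, log_type, log_text):
        print(f"[{self.title}] {log_type}: {log_text}")

    def to_dict(self):
        return self.__dict__.copy()
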
midl_summary.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- import pandas as pd
3
- import numpy as np
4
-
5
-
6
- compare_to_gt = True
7
- ground_truth = pd.read_csv("data/zipfiles.csv", sep="\t")
8
- results = pd.read_csv("data/results.csv", sep="\t")
9
- verbose = 0
10
-
11
- eval_readme = []
12
- eval_training = []
13
- eval_evaluating = []
14
- eval_licensing = []
15
- eval_weights = []
16
- eval_dependencies = []
17
- full_results = []
18
- for (index1, row1), (index2, row2) in zip(ground_truth.iterrows(), results.iterrows()):
19
- if (pd.isna(row1["training"])):
20
- continue
21
-
22
- print(f"\nEvaluating {index1+1} out of {len(ground_truth.index)} papers...")
23
- print(f'Paper title - "{row1["title"]}" ({row1["year"]})')
24
- print(f'Repository link - {row1["url"]}')
25
- if ((not(pd.isna(row1["dependencies"]))) & (row2["pred_dependencies"] is not None)):
26
- eval_dependencies.append(row2["pred_dependencies"] == row1["dependencies"])
27
- if (row2["pred_dependencies"] != row1["dependencies"]):
28
- print(f"Dependencies acc. - {row2['pred_dependencies']} (GT:{row1['dependencies']})")
29
- if ((not(pd.isna(row1["training"]))) & (row2["pred_dependencies"] is not None)):
30
- eval_training.append(row1["training"] == row2["pred_training"])
31
- if (row1["training"] != row2["pred_training"]):
32
- print(f"Training acc. -{row2['pred_training']} (GT:{row1['training']})")
33
- if ((not(pd.isna(row1["evaluation"]))) & (row2["pred_dependencies"] is not None)):
34
- eval_evaluating.append(row1["evaluation"] == row2["pred_evaluation"])
35
- if (row1["evaluation"] != row2["pred_evaluation"]):
36
- print(f"Evaluating acc. - {row2['pred_evaluation']} (GT:{row1['evaluation']})")
37
- if ((not(pd.isna(row1["weights"]))) & (row2["pred_dependencies"] is not None)):
38
- eval_weights.append(row1["weights"] == row2["pred_weights"])
39
- if (row1["weights"] != row2["pred_weights"]):
40
- print(f"Weights acc. - {row2['pred_weights']} (GT:{row1['weights']})")
41
- if ((not(pd.isna(row1["readme"]))) & (row2["pred_dependencies"] is not None)):
42
- eval_readme.append(row1["readme"] == row2["pred_readme"])
43
- if (row1["readme"] != row2["pred_readme"]):
44
- print(f"README acc. - {row2['pred_readme']} (GT:{row1['readme']})")
45
- if ((not(pd.isna(row1["license"]))) & (row2["pred_dependencies"] is not None)):
46
- eval_licensing.append(("No" if row1["license"] == "No" else "Yes") == row2["pred_license"])
47
- if (("No" if row1["license"] == "No" else "Yes") != row2["pred_license"]):
48
- print(f"LICENSE acc. - {row2['pred_license']} (GT:{row1['license']})")
49
-
50
-
51
- print("\nSummary:")
52
- print(f"Dependencies acc. - {int(100 * np.mean(eval_dependencies))}%")
53
- print(f"Training acc. - {int(100 * np.mean(eval_training))}%")
54
- print(f"Evaluating acc. - {int(100 * np.mean(eval_evaluating))}%")
55
- print(f"Weights acc. - {int(100 * np.mean(eval_weights))}%")
56
- print(f"README acc. - {int(100 * np.mean(eval_readme))}%")
57
- print(f"LICENSE acc. - {int(100 * np.mean(eval_licensing))}%")
 
 
plotting/midl_summary.py ADDED
@@ -0,0 +1,59 @@
 
1
+ import os
2
+ import os
3
+ import sys
4
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5
+ sys.path.append(ROOT_DIR)
6
+ import pandas as pd
7
+ import numpy as np
8
+ from core.paper import Paper
9
+
10
+ def compare(ground_truth, automated_truth, key, verbose=False):
11
+ if key not in ground_truth.keys() or key not in automated_truth.keys():
12
+ return np.nan
13
+ if (pd.isna(ground_truth[key]) or pd.isna(automated_truth[key])):
14
+ return np.nan
15
+
16
+ if (key == "license"):
17
+ ground_truth[key] = "No" if ground_truth[key] == "No" else "Yes"
18
+ res = ground_truth[key] == automated_truth[key]
19
+ if verbose and res == False:
20
+ print(f"{key} acc. - {automated_truth[key]} (GT:{ground_truth[key]})")
21
+ return res
22
+
23
+ max_workers = 6
24
+ compare_to_gt = True
25
+ verbose = True
26
+ training = True
27
+
28
+ paper_dump = pd.read_csv("data/results.csv", sep="\t")
29
+ papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]
30
+
31
+ eval_readme = []
32
+ eval_training = []
33
+ eval_evaluating = []
34
+ eval_licensing = []
35
+ eval_weights = []
36
+ eval_dependencies = []
37
+ full_results = []
38
+ for idx, paper in enumerate(papers):
39
+ if paper.venue != "MIDL" or paper.main_repo_url is None or (int(paper.year) >= 2024 if training else int(paper.year) < 2024):
40
+ continue
41
+
42
+ if (verbose):
43
+ print(f"\nEvaluating {idx} out of {len(papers)} papers...")
44
+ print(f'Paper title - "{paper.title}" ({paper.year})')
45
+ print(f'Repository link - {paper.main_repo_url}')
46
+ eval_dependencies.append(compare(paper.code_repro_manual, paper.code_repro_auto, "dependencies", verbose))
47
+ eval_training.append(compare(paper.code_repro_manual, paper.code_repro_auto, "training", verbose))
48
+ eval_evaluating.append(compare(paper.code_repro_manual, paper.code_repro_auto, "evaluation", verbose))
49
+ eval_weights.append(compare(paper.code_repro_manual, paper.code_repro_auto, "weights", verbose))
50
+ eval_readme.append(compare(paper.code_repro_manual, paper.code_repro_auto, "readme", verbose))
51
+ eval_licensing.append(compare(paper.code_repro_manual, paper.code_repro_auto, "license", verbose))
52
+
53
+ print("\nSummary:")
54
+ print(f"Dependencies acc. - {int(100 * np.nanmean(eval_dependencies))}%")
55
+ print(f"Training acc. - {int(100 * np.nanmean(eval_training))}%")
56
+ print(f"Evaluating acc. - {int(100 * np.nanmean(eval_evaluating))}%")
57
+ print(f"Weights acc. - {int(100 * np.nanmean(eval_weights))}%")
58
+ print(f"README acc. - {int(100 * np.nanmean(eval_readme))}%")
59
+ print(f"LICENSE acc. - {int(100 * np.nanmean(eval_licensing))}%")
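
compare returns np.nan whenever either side of a comparison is missing, so the summary averages a mixed list of booleans and NaNs with np.nanmean. A tiny worked example of what that does (values invented for illustration):

import numpy as np

# Three papers with ground truth (True/False) and one without (nan):
eval_training = [True, False, True, np.nan]
print(int(100 * np.nanmean(eval_training)))  # 66 — the nan entry is ignored, 2 of 3 match
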
plotting/paper_plots.py CHANGED
@@ -1,41 +1,34 @@
1
  import plotly.express as px
 
 
 
 
2
  import numpy as np
3
-
4
- paper_dump = pd.read_csv('data/dump.csv', sep="\t")
5
- # Calculate total number of URLs per year and venue
6
- custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
7
- total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
8
-
9
- # Calculate the number of URLs with errors per year and venue
10
- total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
11
-
12
- # Merge the DataFrames to calculate the error rate
13
- merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
14
- merged_df['repo_rate'] = merged_df['total_urls'] / merged_df['total_titles']
15
-
16
- # Plot the error rates using Plotly, with year on x-axis and color by venue
17
- fig = px.bar(
18
- merged_df,
19
- x='year',
20
- y='total_titles',
21
- color='venue',
22
- barmode='group',
23
- title=f'Number of papers per venue',
24
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
25
- category_orders={'venue': custom_order}
26
- )
27
-
28
- fig.update_xaxes(range=[2018, 2024])
29
- fig.show()
30
-
31
  import plotly.express as px
32
  import numpy as np
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Calculate total number of URLs per year and venue
35
  total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
36
 
37
  # Calculate the number of URLs with errors per year and venue
38
- total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
 
39
 
40
  # Merge the DataFrames to calculate the error rate
41
  merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
@@ -50,25 +43,10 @@ fig = px.bar(
50
  barmode='group',
51
  title=f'Number of papers per venue',
52
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
53
- category_orders={'venue': custom_order}
54
- )
55
-
56
- fig.update_xaxes(range=[2018, 2024])
57
- fig.show()
58
-
59
- # Plot the error rates using Plotly, with year on x-axis and color by venue
60
- fig = px.bar(
61
- merged_df,
62
- x='year',
63
- y='total_urls',
64
- color='venue',
65
- barmode='group',
66
- title=f'Number of papers per venue',
67
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
68
- category_orders={'venue': custom_order}
69
  )
70
 
71
- fig.update_xaxes(range=[2018, 2024])
72
  fig.show()
73
 
74
 
@@ -81,9 +59,9 @@ fig = px.bar(
81
  barmode='group',
82
  title=f'Number of repositories per venue',
83
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
84
- category_orders={'venue': custom_order}
85
  )
86
- fig.update_xaxes(range=[2018, 2024])
87
  fig.update_yaxes(range=[0, 1])
88
 
89
  fig.show()
 
1
  import plotly.express as px
2
+ import os
3
+ import sys
4
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5
+ sys.path.append(ROOT_DIR)
6
  import numpy as np
7
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import plotly.express as px
9
  import numpy as np
10
+ from config.constants import VENUE_ORDER
11
+ import json
12
+ from core.paper import _parse_url_field
13
 
14
+ paper_dump = pd.read_csv('data/urls.csv', sep="\t")
15
+ paper_dump['urls_manual'] = paper_dump['urls_manual'].apply(
16
+ lambda x: _parse_url_field(json.loads(x))
17
+ )
18
+ paper_dump['urls_auto'] = paper_dump['urls_auto'].apply(
19
+ lambda x: _parse_url_field(json.loads(x))
20
+ )
21
+ paper_dump['url'] = paper_dump.apply(
22
+ lambda row: next((u for u in [*row['urls_manual'], *row['urls_auto']] if "github.com" in u), None),
23
+ axis=1
24
+ )
25
  # Calculate total number of URLs per year and venue
26
  total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
27
 
28
  # Calculate the number of URLs with errors per year and venue
29
+ total_url_per_venue = paper_dump[
30
+ paper_dump["url"].notna() & (paper_dump["url"] != "")
31
+ ].groupby(['year', 'venue']).size().reset_index(name='total_urls')
32
 
33
  # Merge the DataFrames to calculate the error rate
34
  merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
 
43
  barmode='group',
44
  title=f'Number of papers per venue',
45
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
46
+ category_orders={'venue': VENUE_ORDER}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  )
48
 
49
+ fig.update_xaxes(range=[2018, 2025])
50
  fig.show()
51
 
52
 
 
59
  barmode='group',
60
  title=f'Number of repositories per venue',
61
  labels={'error_rate': 'Success Rate', 'year': 'Year'},
62
+ category_orders={'venue': VENUE_ORDER}
63
  )
64
+ fig.update_xaxes(range=[2018, 2025])
65
  fig.update_yaxes(range=[0, 1])
66
 
67
  fig.show()
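
The plotting script now derives one url per paper by decoding the JSON-encoded urls_manual / urls_auto columns with _parse_url_field from core.paper, which is not included in this diff. The helper below is only a guess at the behaviour the plotting code assumes — normalising whatever is stored into a plain list of URL strings — and is not the actual implementation.

def _parse_url_field_sketch(value):
    # Assumed behaviour: accept a list, a single string, or None and always return a list.
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)

# paper_dump['urls_manual'].apply(lambda x: _parse_url_field_sketch(json.loads(x)))
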
plotting/print_incorrect.py ADDED
@@ -0,0 +1,44 @@
 
1
+ import plotly.express as px
2
+ import pandas as pd
3
+ import re
4
+
5
+ # Define columns for all relevant predictions
6
+ pred_columns = ['pred_dependencies', 'pred_training',
7
+ 'pred_evaluation', 'pred_weights', 'pred_readme',
8
+ 'pred_license']
9
+
10
+ # Define the real and predicted column pairs
11
+ real_pred_columns = {
12
+ 'dependencies': 'pred_dependencies',
13
+ 'training': 'pred_training',
14
+ 'evaluation': 'pred_evaluation',
15
+ 'weights': 'pred_weights',
16
+ 'readme': 'pred_readme',
17
+ 'license': 'pred_license'
18
+ }
19
+
20
+ df = pd.read_csv('data/results.csv', sep="\t")
21
+
22
+ # Cleanup
23
+ df['year'] = pd.to_numeric(df['year'], errors='coerce')
24
+ df = df.dropna(subset=['year'])
25
+ df['year'] = df['year'].astype(int)
26
+
27
+ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
28
+
29
+ # Group by venue
30
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
31
+ df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == "No") | (pd.isna(row))) else "Yes")
32
+
33
+ # Add matching counts for each category
34
+ for real, pred in real_pred_columns.items():
35
+ df_filtered[f'matching_{real}'] = df_filtered[real] == df_filtered[pred]
36
+
37
+ for real, pred in real_pred_columns.items():
38
+ print(f"Evaluations for {real}:")
39
+ for idx, row in df_filtered.iterrows():
40
+ if ((row['year'] == 2024) | pd.isna(row["url"]) | (row["url"] == "") | (pd.isna(row[real]))):
41
+ continue
42
+
43
+ if not(row[f'matching_{real}']):
44
+ print(f"Automated test for {real} failed for link: {row['url']} [{row[real]} - {row[pred]}]")
plotting/result_plots.py CHANGED
@@ -1,25 +1,66 @@
1
  import plotly.express as px
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  df = pd.read_csv('data/results.csv', sep="\t")
 
 
 
 
 
 
 
 
5
  custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
6
 
7
- # Calculate total number of URLs per year and venue
8
- total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
9
 
10
- # Calculate the number of URLs with errors per year and venue
11
- errors_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='errors')
12
 
13
- # Merge the DataFrames to calculate the error rate
14
- error_rate_df = pd.merge(total_urls_per_year_venue, errors_per_year_venue, on=['year', 'venue'], how='left')
15
- error_rate_df['errors'] = error_rate_df['errors'].fillna(0) # Replace NaN with 0 for venues with no errors
16
- error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
17
 
18
  # Plot the error rates using Plotly, with year on x-axis and color by venue
19
  fig = px.bar(
20
- error_rate_df,
21
  x='year',
22
- y='error_rate',
23
  color='venue',
24
  barmode='group',
25
  title=f'Success Rate per Venue and Year for "valid_url"',
@@ -32,48 +73,53 @@ fig.update_xaxes(range=[2017.5, 2024.5])
32
  fig.show()
33
 
34
 
35
- for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
36
- # Calculate total number of URLs per year and venue
37
- total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')
 
 
 
 
 
 
38
 
39
- # Calculate the number of URLs with errors per year and venue
40
- passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')
41
 
42
- # Merge the DataFrames to calculate the error rate
43
- success_rate_df = pd.merge(total_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
44
- success_rate_df['successes'] = success_rate_df['successes'].fillna(0) # Replace NaN with 0 for venues with no errors
45
- success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
46
 
47
- # Plot the error rates using Plotly, with year on x-axis and color by venue
48
- fig = px.bar(
49
- success_rate_df,
50
- x='year',
51
- y='success_rate',
52
- color='venue',
53
- barmode='group',
54
- title=f'Success Rate per Venue and Year for "{topic}"',
55
- labels={'error_rate': 'Success Rate', 'year': 'Year'},
56
- category_orders={'venue': custom_order}
57
- )
58
 
59
- fig.update_yaxes(range=[0, 1])
60
- fig.update_xaxes(range=[2017.5, 2024.5])
61
- fig.show()
 
 
62
 
 
 
 
63
 
64
  # List of columns to check for "No"
65
- columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
 
 
 
66
 
67
  # Step 1: Calculate the number of "No" answers per row for the specified columns
68
- df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
69
 
70
  # Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
71
  fig = px.scatter(
72
- df,
73
  x='pred_citations',
74
  y='no_count',
75
  color='venue',
76
- title='Number of "No" Answers vs Predicted Stars, Color Coded by Venue',
77
  labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
78
  category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
79
  log_x=True
@@ -82,19 +128,16 @@ fig = px.scatter(
82
  # Step 3: Display the scatter plot
83
  fig.show()
84
 
85
- # List of columns to check for "No"
86
- columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
87
-
88
- # Step 1: Calculate the number of "No" answers per row for the specified columns
89
- df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
90
 
91
  # Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
92
  fig = px.strip(
93
- df,
94
  x='venue',
95
  y='no_count',
96
  color='venue',
97
- title='Individual "No" Scores with Jitter per Venue',
98
  labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
99
  category_orders={'venue': custom_order}, # Ensure custom order for venues
100
  stripmode='overlay' # Allows all individual points to overlay each other
@@ -105,7 +148,7 @@ fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))
105
 
106
  # Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
107
  fig.add_trace(px.box(
108
- df,
109
  x='venue',
110
  y='no_count',
111
  category_orders={'venue': custom_order}
@@ -114,28 +157,38 @@ fig.add_trace(px.box(
114
  # Step 5: Show the plot
115
  fig.show()
116
 
117
- for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
118
- # Calculate total number of URLs per venue
119
- total_urls_per_venue = df.groupby('venue').size().reset_index(name='total_urls')
120
-
121
- # Calculate the number of URLs with errors per venue
122
- errors_per_venue = df[df[topic] != "No"].groupby('venue').size().reset_index(name='errors')
123
-
124
- # Merge the DataFrames to calculate the error rate
125
- error_rate_df = pd.merge(total_urls_per_venue, errors_per_venue, on='venue', how='left')
126
- error_rate_df['errors'] = error_rate_df['errors'].fillna(0) # Replace NaN with 0 for venues with no errors
127
- error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
128
-
129
- # Plot the error rates using Plotly, with venue on x-axis
130
- fig = px.bar(
131
- error_rate_df,
132
- x='venue',
133
- y='error_rate',
134
- color='venue',
135
- title=f'Success Rate per Venue for "{topic}"',
136
- labels={'error_rate': 'Success Rate', 'venue': 'Venue'},
137
- category_orders={'venue': custom_order}
138
- )
139
-
140
- fig.update_yaxes(range=[0, 1])
141
- fig.show()
 
 
 
 
 
 
 
 
 
 
 
1
  import plotly.express as px
2
  import pandas as pd
3
+ import re
4
+
5
+ # Define columns for all relevant predictions
6
+ pred_columns = ['pred_dependencies', 'pred_training',
7
+ 'pred_evaluation', 'pred_weights', 'pred_readme',
8
+ 'pred_license']
9
+
10
+ # Define the real and predicted column pairs
11
+ real_pred_columns = {
12
+ 'dependencies': 'pred_dependencies',
13
+ 'training': 'pred_training',
14
+ 'evaluation': 'pred_evaluation',
15
+ 'weights': 'pred_weights',
16
+ 'readme': 'pred_readme',
17
+ 'license': 'pred_license'
18
+ }
19
 
20
  df = pd.read_csv('data/results.csv', sep="\t")
21
+
22
+ # Cleanup
23
+ df['year'] = pd.to_numeric(df['year'], errors='coerce')
24
+ df = df.dropna(subset=['year'])
25
+ df['year'] = df['year'].astype(int)
26
+
27
+ # df['venue'] = df['venue'].apply(lambda x: str(re.search(r"'(.*?)'", x).group(1)))
28
+
29
  custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
30
 
31
+ # Group by year and venue, and calculate the ratio of papers where URL is not None
32
+ df_grouped = df.groupby(['year', 'venue']).agg(
33
+ total_papers=('title', 'count'),
34
+ papers_with_url=('url', lambda x: x.notna().sum()),
35
+ valid_urls=('pred_live', lambda x: (x == "Yes").sum())
36
+ ).reset_index()
37
+
38
+ df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']
39
+
40
+ # Create the plotly figure
41
+ fig = px.bar(
42
+ df_grouped,
43
+ x='year',
44
+ y='ratio',
45
+ color='venue',
46
+ barmode='group',
47
+ title=f'Success Rate per Venue and Year for "valid_url"',
48
+ labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},
49
+ category_orders={'venue': custom_order}
50
+ )
51
+
52
+ fig.update_yaxes(range=[0, 1])
53
+ fig.update_xaxes(range=[2017.5, 2024.5])
54
+ fig.show()
55
 
56
+ df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']
 
57
 
 
 
 
 
58
 
59
  # Plot the error rates using Plotly, with year on x-axis and color by venue
60
  fig = px.bar(
61
+ df_grouped,
62
  x='year',
63
+ y='valid_ratio',
64
  color='venue',
65
  barmode='group',
66
  title=f'Success Rate per Venue and Year for "valid_url"',
 
73
  fig.show()
74
 
75
 
76
+ # Ensure boolean columns are actually booleans
77
+ df_new = df.copy()
78
+ for col in pred_columns:
79
+ df_new[col] = df_new[col] == "Yes"
80
+
81
+ df_grouped = df_new.groupby('venue').agg(
82
+ valid_urls=('pred_live', lambda x: (x == "Yes").sum()),
83
+ **{col: (col, lambda x: x[df_new['pred_live'] == "Yes"].sum()) for col in pred_columns}
84
+ ).reset_index()
85
 
 
 
86
 
87
+ # Calculate the ratio for each prediction column
88
+ for col in pred_columns:
89
+ df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']
 
90
 
91
+ # Melt the dataframe for easier plotting
92
+ df_melted = df_grouped.melt(id_vars=['venue'],
93
+ value_vars=pred_columns,
94
+ var_name='Prediction Type',
95
+ value_name='Ratio')
 
 
 
 
 
 
96
 
97
+ # Create a grouped bar plot
98
+ fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',
99
+ barmode='group', # Ensures bars are side by side
100
+ category_orders={'venue': custom_order},
101
+ title='Ratio of Predictions by Venue')
102
 
103
+ # Show the figure
104
+ fig.update_yaxes(range=[0, 1])
105
+ fig.show()
106
 
107
  # List of columns to check for "No"
108
+ # Step 1: Filter only rows where pred_live is "Yes"
109
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
110
+ for col in pred_columns:
111
+ df_filtered[col] = df_filtered[col] == "Yes"
112
 
113
  # Step 1: Calculate the number of "No" answers per row for the specified columns
114
+ df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)
115
 
116
  # Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
117
  fig = px.scatter(
118
+ df_filtered,
119
  x='pred_citations',
120
  y='no_count',
121
  color='venue',
122
+ title='Number of passed tests, Color Coded by Venue',
123
  labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
124
  category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
125
  log_x=True
 
128
  # Step 3: Display the scatter plot
129
  fig.show()
130
 
131
+ # [np.corrcoef(np.array(df_filtered[col][~(pd.isna(df_filtered['pred_citations']))], dtype=int), df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])[0, 1] for col in pred_columns]
132
+ # np.corrcoef(np.array(df_filtered['no_count'][~(pd.isna(df_filtered['pred_citations']))]), (1 + np.array(df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])))
 
 
 
133
 
134
  # Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
135
  fig = px.strip(
136
+ df_filtered,
137
  x='venue',
138
  y='no_count',
139
  color='venue',
140
+ title='Automated Reproducibility Score per Venue',
141
  labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
142
  category_orders={'venue': custom_order}, # Ensure custom order for venues
143
  stripmode='overlay' # Allows all individual points to overlay each other
 
148
 
149
  # Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
150
  fig.add_trace(px.box(
151
+ df_filtered,
152
  x='venue',
153
  y='no_count',
154
  category_orders={'venue': custom_order}
 
157
  # Step 5: Show the plot
158
  fig.show()
159
 
160
+ # Group by venue
161
+ df_filtered = df[df['pred_live'] == "Yes"].copy()
162
+ df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == "No") | (pd.isna(row))) else "Yes")
163
+ df_grouped = df_filtered.groupby('venue').agg(
164
+ total_papers=('title', 'count')
165
+ ).reset_index()
166
+
167
+ # Add matching counts for each category
168
+ for real, pred in real_pred_columns.items():
169
+ df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)
170
+
171
+ # Compute the ratio for each category
172
+ for real in real_pred_columns.keys():
173
+ df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']
174
+
175
+ # Melt the dataframe for visualization
176
+ df_melted = df_grouped.melt(id_vars=['venue'],
177
+ value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()],
178
+ var_name='Category',
179
+ value_name='Ratio')
180
+
181
+ # Clean up category names
182
+ df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()
183
+
184
+ # Create the bar plot
185
+ fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',
186
+ barmode='group',
187
+ title='Ratio of Matching Real vs Predicted Categories by Venue',
188
+ labels={'Ratio': 'Ratio of Matches'})
189
+
190
+ # Ensure y-axis range is between 0 and 1
191
+ fig.update_yaxes(range=[0, 1])
192
+
193
+ # Show the figure
194
+ fig.show()
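
One caveat on the matching-ratio block above: assigning df_filtered.groupby('venue').apply(...).reset_index(drop=True) into df_grouped relies on both frames listing venues in the same sorted order. A merge keyed on venue avoids that positional assumption; a hedged alternative sketch for one category:

# Alternative to the positional assignment: join the per-venue match counts explicitly.
matches = (
    df_filtered.groupby('venue')
    .apply(lambda g: (g['training'] == g['pred_training']).sum())
    .rename('matching_training')
    .reset_index()
)
df_grouped = df_grouped.merge(matches, on='venue', how='left')
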
plotting/results.ipynb ADDED
@@ -0,0 +1,241 @@
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "vscode": {
8
+ "languageId": "plaintext"
9
+ }
10
+ },
11
+ "outputs": [],
12
+ "source": [
13
+ "import plotly.express as px\n",
14
+ "import pandas as pd\n",
15
+ "import re\n",
16
+ "\n",
17
+ "# Define columns for all relevant predictions\n",
18
+ "pred_columns = ['pred_dependencies', 'pred_training', \n",
19
+ " 'pred_evaluation', 'pred_weights', 'pred_readme', \n",
20
+ " 'pred_license']\n",
21
+ "\n",
22
+ "# Define the real and predicted column pairs\n",
23
+ "real_pred_columns = {\n",
24
+ " 'dependencies': 'pred_dependencies',\n",
25
+ " 'training': 'pred_training',\n",
26
+ " 'evaluation': 'pred_evaluation',\n",
27
+ " 'weights': 'pred_weights',\n",
28
+ " 'readme': 'pred_readme',\n",
29
+ " 'license': 'pred_license'\n",
30
+ "}\n"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {
37
+ "vscode": {
38
+ "languageId": "plaintext"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "\n",
44
+ "df = pd.read_csv('data/results.csv', sep=\"\\t\")\n",
45
+ "\n",
46
+ "# Cleanup\n",
47
+ "df['year'] = pd.to_numeric(df['year'], errors='coerce')\n",
48
+ "df = df.dropna(subset=['year'])\n",
49
+ "df['year'] = df['year'].astype(int)\n",
50
+ "\n",
51
+ "df['venue'] = df['venue'].apply(lambda x: str(re.search(r\"'(.*?)'\", x).group(1)))\n",
52
+ "\n",
53
+ "custom_order = [\"MICCAI\", \"MIDL\", \"Nature\", \"arXiv\"]\n",
54
+ "\n",
55
+ "# Group by year and venue, and calculate the ratio of papers where URL is not None\n",
56
+ "df_grouped = df.groupby(['year', 'venue']).agg(\n",
57
+ " total_papers=('title', 'count'),\n",
58
+ " papers_with_url=('url', lambda x: x.notna().sum()),\n",
59
+ " valid_urls=('pred_live', lambda x: (x == \"Yes\").sum())\n",
60
+ ").reset_index()\n",
61
+ "\n",
62
+ "df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']\n",
63
+ "\n",
64
+ "# Create the plotly figure\n",
65
+ "fig = px.bar(\n",
66
+ " df_grouped,\n",
67
+ " x='year',\n",
68
+ " y='ratio',\n",
69
+ " color='venue',\n",
70
+ " barmode='group',\n",
71
+ " title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
72
+ " labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},\n",
73
+ " category_orders={'venue': custom_order}\n",
74
+ ")\n",
75
+ "\n",
76
+ "fig.update_yaxes(range=[0, 1])\n",
77
+ "fig.update_xaxes(range=[2017.5, 2024.5])\n",
78
+ "fig.show()\n",
79
+ "\n",
80
+ "df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']\n",
81
+ "\n",
82
+ "\n",
83
+ "# Plot the error rates using Plotly, with year on x-axis and color by venue\n",
84
+ "fig = px.bar(\n",
85
+ " df_grouped,\n",
86
+ " x='year',\n",
87
+ " y='valid_ratio',\n",
88
+ " color='venue',\n",
89
+ " barmode='group',\n",
90
+ " title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
91
+ " labels={'error_rate': 'Success Rate', 'year': 'Year'},\n",
92
+ " category_orders={'venue': custom_order}\n",
93
+ ")\n",
94
+ "\n",
95
+ "fig.update_yaxes(range=[0, 1])\n",
96
+ "fig.update_xaxes(range=[2017.5, 2024.5])\n",
97
+ "fig.show()\n",
98
+ "\n",
99
+ "\n",
100
+ "# Ensure boolean columns are actually booleans\n",
101
+ "df_new = df.copy()\n",
102
+ "for col in pred_columns:\n",
103
+ " df_new[col] = df_new[col] == \"Yes\"\n",
104
+ "\n",
105
+ "df_grouped = df_new.groupby('venue').agg(\n",
106
+ " valid_urls=('pred_live', lambda x: (x == \"Yes\").sum()),\n",
107
+ " **{col: (col, lambda x: x[df_new['pred_live'] == \"Yes\"].sum()) for col in pred_columns} \n",
108
+ ").reset_index()\n",
109
+ "\n",
110
+ "\n",
111
+ "# Calculate the ratio for each prediction column\n",
112
+ "for col in pred_columns:\n",
113
+ " df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']\n",
114
+ "\n",
115
+ "# Melt the dataframe for easier plotting\n",
116
+ "df_melted = df_grouped.melt(id_vars=['venue'], \n",
117
+ " value_vars=pred_columns, \n",
118
+ " var_name='Prediction Type', \n",
119
+ " value_name='Ratio')\n",
120
+ "\n",
121
+ "# Create a grouped bar plot\n",
122
+ "fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',\n",
123
+ " barmode='group', # Ensures bars are side by side\n",
124
+ " category_orders={'venue': custom_order},\n",
125
+ " title='Ratio of Predictions by Venue')\n",
126
+ "\n",
127
+ "# Show the figure\n",
128
+ "fig.update_yaxes(range=[0, 1])\n",
129
+ "fig.show()\n",
130
+ "\n",
131
+ "# List of columns to check for \"No\"\n",
132
+ "# Step 1: Filter only rows where pred_live is \"Yes\"\n",
133
+ "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
134
+ "for col in pred_columns:\n",
135
+ " df_filtered[col] = df_filtered[col] == \"Yes\"\n",
136
+ "# Step 1: Count how many checks passed (\"Yes\") per row across the prediction columns\n",
137
+ "# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
138
+ "df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)\n",
139
+ "\n",
140
+ "# Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue\n",
141
+ "fig = px.scatter(\n",
142
+ " df_filtered,\n",
143
+ " x='pred_citations',\n",
144
+ " y='no_count',\n",
145
+ " color='venue',\n",
146
+ " title='Number of passed tests, Color Coded by Venue',\n",
147
+ " labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},\n",
148
+ " category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary\n",
149
+ " log_x=True\n",
150
+ ")\n",
151
+ "\n",
152
+ "# Step 3: Display the scatter plot\n",
153
+ "fig.show()\n",
154
+ "\n",
155
+ "# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
156
+ "# Step 1: Count how many checks passed (\"Yes\") per row across the prediction columns\n",
157
+ "\n",
158
+ "# Step 2: Create a strip plot (scatter-like) with jitter to show individual \"No\" counts\n",
159
+ "fig = px.strip(\n",
160
+ " df_filtered,\n",
161
+ " x='venue',\n",
162
+ " y='no_count',\n",
163
+ " color='venue',\n",
164
+ " title='Automated Reproducibility Score per Venue',\n",
165
+ " labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},\n",
166
+ " category_orders={'venue': custom_order}, # Ensure custom order for venues\n",
167
+ " stripmode='overlay' # Allows all individual points to overlay each other\n",
168
+ ")\n",
169
+ "\n",
170
+ "# Step 3: Add some jitter to the x-axis so points don't overlap\n",
171
+ "fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))\n",
172
+ "\n",
173
+ "# Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread\n",
174
+ "fig.add_trace(px.box(\n",
175
+ " df_filtered,\n",
176
+ " x='venue',\n",
177
+ " y='no_count',\n",
178
+ " category_orders={'venue': custom_order}\n",
179
+ ").data[0]) # We add the first trace of the box plot to overlay\n",
180
+ "\n",
181
+ "# Step 5: Show the plot\n",
182
+ "fig.show()\n"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "vscode": {
190
+ "languageId": "plaintext"
191
+ }
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "\n",
196
+ "# Group by venue\n",
197
+ "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
198
+ "df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == \"No\") | (pd.isna(row))) else \"Yes\")\n",
199
+ "df_grouped = df_filtered.groupby('venue').agg(\n",
200
+ " total_papers=('title', 'count')\n",
201
+ ").reset_index()\n",
202
+ "\n",
203
+ "# Add matching counts for each category\n",
204
+ "for real, pred in real_pred_columns.items():\n",
205
+ " df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)\n",
206
+ "\n",
207
+ "# Compute the ratio for each category\n",
208
+ "for real in real_pred_columns.keys():\n",
209
+ " df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']\n",
210
+ "\n",
211
+ "# Melt the dataframe for visualization\n",
212
+ "df_melted = df_grouped.melt(id_vars=['venue'], \n",
213
+ " value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()], \n",
214
+ " var_name='Category', \n",
215
+ " value_name='Ratio')\n",
216
+ "\n",
217
+ "# Clean up category names\n",
218
+ "df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()\n",
219
+ "\n",
220
+ "# Create the bar plot\n",
221
+ "fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',\n",
222
+ " barmode='group', \n",
223
+ " title='Ratio of Matching Real vs Predicted Categories by Venue',\n",
224
+ " labels={'Ratio': 'Ratio of Matches'})\n",
225
+ "\n",
226
+ "# Ensure y-axis range is between 0 and 1\n",
227
+ "fig.update_yaxes(range=[0, 1])\n",
228
+ "\n",
229
+ "# Show the figure\n",
230
+ "fig.show()"
231
+ ]
232
+ }
233
+ ],
234
+ "metadata": {
235
+ "language_info": {
236
+ "name": "python"
237
+ }
238
+ },
239
+ "nbformat": 4,
240
+ "nbformat_minor": 2
241
+ }
plotting/urls.py ADDED
@@ -0,0 +1,37 @@
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import ast
4
+ import os
5
+ import sys
6
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
7
+ sys.path.append(ROOT_DIR)
8
+ from core.paper import Paper
9
+
10
+ df = pd.read_csv('data/urls.csv', sep="\t")
11
+ success = 0
12
+ total = 0
13
+ papers = [Paper.from_row(row) for _, row in df.iterrows()]
14
+
15
+ def normalize_url(url):
16
+ return url.strip().lower().rstrip("/")
17
+
18
+ tp, fp, fn = 0, 0, 0
19
+ for paper in papers:
20
+ if (paper.venue == "MICCAI"):
21
+ continue
22
+
23
+ urls_auto = [normalize_url(u) for u in paper.urls_auto]
24
+ urls_manual = [normalize_url(u) for u in paper.urls_manual]
25
+
26
+ auto_set = set(urls_auto)
27
+ manual_set = set(urls_manual)
28
+
29
+ tp += len(auto_set & manual_set)
30
+ fp += len(auto_set - manual_set)
31
+ fn += len(manual_set - auto_set)
32
+
33
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 0
34
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0
35
+
36
+ print(f"Precision: {precision:.3f}")
37
+ print(f"Recall: {recall:.3f}")
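
For reference, a tiny worked example of the set-based precision and recall computed above (URLs invented): if extraction returns {a, b} and the manual annotation is {a, c}, then tp = 1, fp = 1, fn = 1, so both precision and recall are 0.5.

auto_set = {"github.com/org/a", "github.com/org/b"}    # extracted automatically
manual_set = {"github.com/org/a", "github.com/org/c"}  # annotated manually

tp = len(auto_set & manual_set)  # 1
fp = len(auto_set - manual_set)  # 1
fn = len(manual_set - auto_set)  # 1
print(tp / (tp + fp), tp / (tp + fn))  # 0.5 0.5
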