Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Attila Simkó
committed on
Commit
·
2db37b1
1
Parent(s):
3cfadc8
big upgrade
Browse files- .gitignore +1 -7
- app.py +7 -3
- config/__pycache__/constants.cpython-312.pyc +0 -0
- config/constants.py +12 -0
- core/__pycache__/conversion.cpython-312.pyc +0 -0
- core/__pycache__/paper.cpython-312.pyc +0 -0
- core/conversion.py +205 -0
- core/paper.py +160 -0
- data/dump.csv +0 -0
- data/fetch_miccai.py +0 -60
- data/fetch_processed.py +0 -31
- data/fetch_zips.py +0 -43
- data/zipfiles.csv +0 -0
- data_generation/fetch_processed.py +123 -0
- {data → data_generation/paper_scraping}/fetch_arxiv.py +63 -49
- data_generation/paper_scraping/fetch_miccai.py +89 -0
- {data → data_generation/paper_scraping}/fetch_nature.py +18 -18
- evaluations/documentation.py +33 -31
- evaluations/license.py +12 -12
- evaluations/pitfalls.py +6 -5
- evaluations/repo_evaluations.py +52 -65
- evaluations/requirements.py +15 -13
- evaluations/training.py +10 -10
- evaluations/url.py +86 -0
- evaluations/utils.py +13 -84
- evaluations/validating.py +10 -9
- evaluations/weights.py +44 -43
- full_eval.py +18 -3
- midl_summary.py +0 -57
- plotting/midl_summary.py +59 -0
- plotting/paper_plots.py +26 -48
- plotting/print_incorrect.py +44 -0
- plotting/result_plots.py +123 -70
- plotting/results.ipynb +241 -0
- plotting/urls.py +37 -0
.gitignore
CHANGED
@@ -1,10 +1,4 @@
|
|
1 |
-
data/
|
2 |
-
data/MICCAI.csv
|
3 |
-
data/arXiv.csv
|
4 |
-
data/Nature.csv
|
5 |
-
data/results.csv
|
6 |
-
data/*.zip
|
7 |
-
data/test/*
|
8 |
*.env
|
9 |
.env
|
10 |
evaluations/__pycache__/*
|
|
|
1 |
+
data/
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
*.env
|
3 |
.env
|
4 |
evaluations/__pycache__/*
|
app.py
CHANGED
@@ -2,8 +2,11 @@ import streamlit as st
|
|
2 |
from evaluations.repo_evaluations import evaluate
|
3 |
from evaluations.models import RemoteLLM
|
4 |
import requests
|
|
|
|
|
|
|
5 |
|
6 |
-
model = RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
|
7 |
|
8 |
st.write("\n")
|
9 |
st.write("Welcome to the online reproducibility evaluation tool!")
|
@@ -12,10 +15,11 @@ st.write("Additionally we look for common pitfalls in the code according to a pu
|
|
12 |
|
13 |
checkbox = st.checkbox("Would you like to see recommendations during evaluation?", value=False)
|
14 |
repo_link = st.text_input("Github repository link:", value="", type="default", help=None)
|
15 |
-
|
16 |
if (repo_link):
|
17 |
verbose = 4 if checkbox else 3
|
18 |
-
|
|
|
|
|
19 |
|
20 |
with st.form("my_form"):
|
21 |
st.write("Notice something wrong? Please tell us so we can improve.")
|
|
|
2 |
from evaluations.repo_evaluations import evaluate
|
3 |
from evaluations.models import RemoteLLM
|
4 |
import requests
|
5 |
+
from core.paper import Paper
|
6 |
+
from core.conversion import fetch_repo
|
7 |
+
import os
|
8 |
|
9 |
+
model = None # RemoteLLM("meta-llama/Llama-3.1-8B-Instruct")
|
10 |
|
11 |
st.write("\n")
|
12 |
st.write("Welcome to the online reproducibility evaluation tool!")
|
|
|
15 |
|
16 |
checkbox = st.checkbox("Would you like to see recommendations during evaluation?", value=False)
|
17 |
repo_link = st.text_input("Github repository link:", value="", type="default", help=None)
|
|
|
18 |
if (repo_link):
|
19 |
verbose = 4 if checkbox else 3
|
20 |
+
paper = Paper.from_url(repo_link, verbose=verbose)
|
21 |
+
fetch_repo(0, paper.main_repo_url, paper.zip_path, os.getenv("githubToken"))
|
22 |
+
evaluate(model, paper, paper.log)
|
23 |
|
24 |
with st.form("my_form"):
|
25 |
st.write("Notice something wrong? Please tell us so we can improve.")
|
config/__pycache__/constants.cpython-312.pyc
ADDED
Binary file (684 Bytes). View file
|
|
config/constants.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from enum import Enum, auto
|
2 |
+
|
3 |
+
VENUE_ORDER = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
4 |
+
MIDL_COLORS = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000", "#FF862C", "#800080"]
|
5 |
+
|
6 |
+
class LogType(Enum):
    """Categories that can be attached to a log entry.

    Every member's value is identical to its name, so a member can be looked
    up either by name (``LogType["ERROR"]``) or by value (``LogType("ERROR")``).
    """

    TITLE = "TITLE"
    LOG = "LOG"
    ERROR = "ERROR"
    WARNING = "WARNING"
    NOTE = "NOTE"
    INFO = "INFO"
    # Paper.log (core/paper.py) resolves levels with LogType[level.upper()] and
    # has an explicit "TIP" branch; without this member that lookup raises
    # KeyError before the branch can ever be reached.
    TIP = "TIP"
|
core/__pycache__/conversion.cpython-312.pyc
ADDED
Binary file (10.1 kB). View file
|
|
core/__pycache__/paper.cpython-312.pyc
ADDED
Binary file (8.64 kB). View file
|
|
core/conversion.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from urllib.request import urlretrieve
|
3 |
+
import requests
|
4 |
+
import random
|
5 |
+
import time
|
6 |
+
import pandas as pd
|
7 |
+
import xml.etree.ElementTree as ET
|
8 |
+
from tqdm import tqdm
|
9 |
+
import os
|
10 |
+
import requests
|
11 |
+
from tqdm import tqdm
|
12 |
+
import xml.etree.ElementTree as ET
|
13 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
14 |
+
from sentence_transformers import SentenceTransformer
|
15 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
16 |
+
import torch
|
17 |
+
import numpy as np
|
18 |
+
import faiss
|
19 |
+
|
20 |
+
|
21 |
+
token = os.getenv("githubToken")
|
22 |
+
|
23 |
+
def noop_logger(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None
|
25 |
+
|
26 |
+
def download_pdf(paper, max_retries=3, timeout=30):
    """Download the PDF of *paper* to ``paper.pdf_path`` unless it is already there.

    Parameters:
        paper: Object exposing ``pdf_url``, ``pdf_path`` and ``log(level, message)``.
        max_retries (int): Maximum number of download attempts.
        timeout (float): Seconds to wait for the server before an attempt is
            treated as failed. Without a timeout a stalled connection would
            block the calling worker indefinitely.

    Returns:
        The same *paper* object, so the function can be used with ``map``.
        Failures are reported through ``paper.log`` instead of being raised.
    """
    if pd.isna(paper.pdf_url):
        paper.log("ERROR", "Missing PDF URL")
        return paper

    pdf_path = paper.pdf_path
    if os.path.exists(pdf_path):
        # Already downloaded on a previous run.
        return paper

    headers = {'User-Agent': 'Mozilla/5.0'}

    for attempt in range(max_retries):
        try:
            response = requests.get(paper.pdf_url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                with open(pdf_path, "wb") as f:
                    f.write(response.content)

                # Random pause between successful downloads.
                time.sleep(random.uniform(1.0, 3.0))
                return paper
            elif response.status_code == 429:
                # Rate limited: back off exponentially before the next attempt.
                wait = 2 ** attempt
                paper.log("WARNING", f"Rate limited, retrying in {wait}s...")
                time.sleep(wait)
            else:
                # Any other status is treated as permanent; stop retrying.
                paper.log("ERROR", f"Download failed: HTTP {response.status_code}")
                break
        except Exception as e:
            # Best-effort: record the problem (network error, timeout, write
            # failure) and move on to the next attempt.
            paper.log("ERROR", f"Download error: {e}")
            time.sleep(1)

    return paper
|
58 |
+
|
59 |
+
def get_api_link(url):
    """Build the GitHub API zipball URL for a repository URL.

    Returns an empty string when ``decompose_url`` cannot extract an owner
    from *url*.
    """
    owner, repository = decompose_url(url)
    if owner is None:
        return ""
    return f"https://api.github.com/repos/{owner}/{repository}/zipball/"
|
64 |
+
|
65 |
+
def decompose_url(url):
    """Split a GitHub URL into ``(username, repo_name)``.

    Everything after ``github.com`` is taken as the path; its first two
    segments are the owner and the repository. Surrounding dots and slashes
    are dropped, and a trailing ``.git`` on the repository name is removed.

    Returns:
        tuple: ``(username, repo_name)``, or ``(None, None)`` when *url* is not
        a string, does not contain ``github.com`` or lacks either segment.
    """
    try:
        path = url.split("github.com")[1]
        path = path.strip(".").strip("/")
        parts = path.split("/")
        username = parts[0]
        repo_name = parts[1]
    except (AttributeError, IndexError, TypeError):
        return None, None

    # Only strip a real ".git" suffix. Splitting on the first ".git" anywhere
    # would cut names such as "user.github.io" or ".github" short.
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-len(".git")]

    if not username or not repo_name:
        return None, None
    return username, repo_name
|
77 |
+
|
78 |
+
def fetch_repo(verbose, repo_url, repo_name, token, force_download=False):
    """Download a GitHub repository as a zip archive.

    Parameters:
        verbose: Not used by this function; kept for signature compatibility.
        repo_url (str): URL of the GitHub repository.
        repo_name (str): Path of the zip file to write.
        token (str or None): GitHub API token. When empty or None the request
            is sent unauthenticated instead of with a bogus "token None" header.
        force_download (bool): Delete an existing archive and download again.

    Raises:
        ValueError: If *repo_url* is not a GitHub URL, cannot be parsed, or the
            API answers with 404 (private repository or broken link).
    """
    if os.path.exists(repo_name):
        if not force_download:
            # Reuse the archive from a previous run.
            return
        os.remove(repo_name)

    # These problems were previously *returned* as ValueError instances, which
    # callers silently discarded; they are raised so that try/except wrappers
    # around this function can actually report them.
    if "github.com" not in repo_url:
        raise ValueError(f"URL not for github repo, please evaluate manually ({repo_url}).")

    headers = {"Authorization": f"token {token}"} if token else {}
    api_url = get_api_link(repo_url)

    if api_url == "":
        raise ValueError(f"Failed to parse the URL, please evaluate manually ({repo_url}).")

    # Sending GET request to GitHub API
    response = requests.get(api_url, headers=headers, timeout=60)

    if response.status_code == 200:
        with open(repo_name, 'wb') as file:
            file.write(response.content)
    if response.status_code == 404:
        raise ValueError("Repository private / Link broken.")
|
102 |
+
|
103 |
+
def download_repo(paper):
    """Download the main GitHub repository of *paper* to ``paper.zip_path``.

    Errors are recorded with ``paper.log`` rather than raised.

    Returns:
        The same *paper* object in every case. This function is used with
        ``executor.map`` and the result list is fed to later stages, so
        returning ``None`` for papers without a repository would break them.
    """
    try:
        if paper.main_repo_url is not None:
            fetch_repo(0, paper.main_repo_url, paper.zip_path, token)
    except Exception as e:
        paper.log("ERROR", f"Repo download failed: {e}")
    return paper
|
112 |
+
|
113 |
+
|
114 |
+
def pdf_to_grobid(filename, save_path=None, grobid_url="https://attilasimko-grobid.hf.space/"):
    """
    Convert one or more PDFs to Grobid XML.

    Parameters:
        filename (str or list): Path to a PDF file, path to a directory containing
            ``.pdf`` files, or a list of PDF paths.
        save_path (str, optional): Directory or file path to save to. For a single
            PDF, ``None`` writes ``temp_grobid.xml`` into the current directory.
            For a list or directory input, anything that is not an existing
            directory falls back to the current directory.
        grobid_url (str, optional): Base URL of the Grobid server, with or without
            a trailing slash.

    Returns:
        For a single PDF: the path of the saved XML file, or the parsed XML root
        element when ``save_path`` is None. For a list or directory: a list with
        one entry per PDF, ``None`` for PDFs that failed.

    Raises:
        ConnectionError: The Grobid health endpoint did not answer with HTTP 200.
        FileNotFoundError: A single *filename* does not exist.
        Exception: Grobid answered the conversion request with a non-200 status.
    """
    # The default URL ends with "/"; strip it so endpoint URLs do not contain "//".
    base_url = grobid_url.rstrip("/")

    def is_server_up(url):
        try:
            response = requests.get(url + "/api/health", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    if not is_server_up(base_url):
        raise ConnectionError(f"The Grobid server {grobid_url} is not available.")

    # Handle multiple files
    if isinstance(filename, list):
        if save_path is None or not os.path.isdir(save_path):
            print(f"Warning: {save_path} is not a directory. PDFs will be saved in the current directory: {os.getcwd()}")
            save_path = "."

        xmls = []
        for pdf in tqdm(filename, desc="Processing PDFs"):
            try:
                xml = pdf_to_grobid(pdf, save_path, grobid_url)
                xmls.append(xml)
            except Exception as e:
                # Keep going; one None per failed PDF keeps results aligned with inputs.
                print(f"Error processing {pdf}: {e}")
                xmls.append(None)

        return xmls

    # Handle directory input
    if os.path.isdir(filename):
        pdfs = [os.path.join(filename, f) for f in os.listdir(filename) if f.endswith(".pdf")]
        if not pdfs:
            print(f"Warning: No PDF files found in directory {filename}")
        return pdf_to_grobid(pdfs, save_path, grobid_url)

    # Ensure file exists
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"The file {filename} does not exist.")

    # Send PDF to Grobid
    with open(filename, "rb") as file:
        files = {"input": file}
        post_url = f"{base_url}/api/processFulltextDocument"
        response = requests.post(post_url, files=files)

    if response.status_code != 200:
        raise Exception(f"Error: {response.reason}")

    # Determine save path
    if save_path is None:
        save_file = os.path.join(os.getcwd(), "temp_grobid.xml")
    elif os.path.isdir(save_path):
        base_name = os.path.splitext(os.path.basename(filename))[0] + ".xml"
        save_file = os.path.join(save_path, base_name)
    else:
        save_file = save_path if save_path.endswith(".xml") else save_path + ".xml"

    # Save the response
    with open(save_file, "wb") as f:
        f.write(response.content)

    # Return XML object if saved to temp file
    if save_path is None:
        return ET.parse(save_file).getroot()
    else:
        return save_file
|
192 |
+
|
193 |
+
def extract_body(xml_root):
    """Extracts and returns the text content of the paper's body from Grobid XML.

    Parameters:
        xml_root: Root element of a TEI document (``xml.etree.ElementTree``).

    Returns:
        str: The paragraphs found under ``<body>``, one per line. Empty string
        when there is no ``<body>`` element or it has no non-empty paragraphs.
    """
    namespace = {"tei": "http://www.tei-c.org/ns/1.0"}  # Define TEI namespace
    body_text = []

    # Locate <body> in the XML structure
    body = xml_root.find(".//tei:body", namespace)
    if body is not None:
        for p in body.findall(".//tei:p", namespace):  # Get all paragraphs inside <body>
            # p.text only holds the text before the first child element, so a
            # paragraph with inline markup would be cut off there. itertext()
            # walks the whole paragraph including children and their tails.
            paragraph = "".join(p.itertext()).strip()
            if paragraph:
                body_text.append(paragraph)

    return "\n".join(body_text)
|
core/paper.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/paper.py
|
2 |
+
|
3 |
+
import os
|
4 |
+
import uuid
|
5 |
+
import re
|
6 |
+
import fitz # PyMuPDF
|
7 |
+
import pdfplumber
|
8 |
+
import hashlib
|
9 |
+
import json
|
10 |
+
import pandas as pd
|
11 |
+
from config.constants import LogType
|
12 |
+
import ast
|
13 |
+
import streamlit as st
|
14 |
+
import datetime
|
15 |
+
from enum import Enum
|
16 |
+
|
17 |
+
def _parse_url_field(value):
|
18 |
+
if isinstance(value, list):
|
19 |
+
return value
|
20 |
+
if isinstance(value, str):
|
21 |
+
try:
|
22 |
+
parsed = ast.literal_eval(value)
|
23 |
+
return parsed if isinstance(parsed, list) else [parsed]
|
24 |
+
except Exception:
|
25 |
+
return [value]
|
26 |
+
if pd.isna(value):
|
27 |
+
return []
|
28 |
+
return [value]
|
29 |
+
|
30 |
+
class Paper:
    """A paper together with its code links, file locations and log entries.

    Instances are normally created through one of the alternate constructors:
    ``from_url`` (a single repository link), ``from_raw`` (a row of the curated
    spreadsheet) or ``from_row`` (a row previously produced by ``to_dict``).
    """

    def __init__(self, title="", venue="", year="", paper_id=None, pdf_url=None, urls_manual=None, urls_auto=None, code_repro_manual=None, code_repro_auto=None, logs=None, verbose=0):
        """Store the metadata and derive the id and the local file paths.

        ``logs`` defaults to None rather than a list literal: a list default is
        created once and would be shared by every Paper built without ``logs``.
        """
        # Metadata
        self.title = title
        self.venue = venue
        self.year = year
        self.pdf_url = pdf_url

        # Optional ground truth links (e.g., from curated metadata)
        self.urls_manual = _parse_url_field(urls_manual)
        self.urls_auto = _parse_url_field(urls_auto)

        self.paper_id = self._compute_id() if pd.isna(paper_id) else paper_id
        self.pdf_path = None if (pd.isna(pdf_url)) else "data/papers/" + self.paper_id + ".pdf"
        self.xml_path = None if (pd.isna(pdf_url)) else "data/xml/" + self.paper_id + ".xml"
        self.zip_path = None if (pd.isna(self.main_repo_url)) else "data/test/" + self.paper_id + ".zip"

        # Internal state
        self.logs = [] if logs is None else logs
        self.code_repro_manual = dict() if pd.isna(code_repro_manual) else code_repro_manual
        self.code_repro_auto = dict() if pd.isna(code_repro_auto) else code_repro_auto
        self.verbose = verbose

    def __repr__(self):
        return f"<Paper: {self.title}>"

    @classmethod
    def from_url(cls, code_url, verbose):
        """Create a Paper that consists of a repository link only.

        There is no title or PDF URL to hash here, so the default id would be
        identical for every link and all repositories would share one
        ``zip_path``. The id is therefore derived from the link itself.
        """
        return cls(
            paper_id=hashlib.sha256(str(code_url).encode("utf-8")).hexdigest()[:16],
            urls_manual=code_url,
            verbose=verbose
        )

    @classmethod
    def from_raw(cls, row):
        """Create a Paper from a spreadsheet row (capitalised column names)."""
        # Supports both dicts and pandas Series
        return cls(
            title=row.get("Title", ""),
            venue=row.get("Venue", ""),
            year=row.get("Year", ""),
            pdf_url=row.get('PDF'),
            urls_manual=row.get("Repository"),
            code_repro_manual={"public": row.get("Data Public"), "dependencies": row.get("Dependencies"), "training": row.get("Training code"), "evaluation": row.get("Evaluation code"), "weights": row.get("Pre-trained models"), "readme": row.get("README file"), "license": row.get("Licensing")}
        )

    @classmethod
    def from_row(cls, row):
        """Create a Paper from a row in the format written by ``to_dict``."""
        # Supports both dicts and pandas Series
        return cls(
            title=row.get("title", ""),
            venue=row.get("venue", ""),
            year=row.get("year", ""),
            paper_id=row.get('paper_id'),
            pdf_url=row.get('pdf_url'),
            urls_manual=json.loads(row.get("urls_manual")),
            urls_auto=json.loads(row.get("urls_auto")),
            code_repro_manual=json.loads(row.get("code_reproducibility_manual")),
            code_repro_auto=json.loads(row.get("code_reproducibility_auto")),
            logs=json.loads(row.get("logs", "[]"))
        )

    @property
    def main_repo_url(self):
        """First GitHub link among the manual, then the automatic URLs, or None."""
        urls = [*self.urls_manual, *self.urls_auto]
        # Skip non-string entries (e.g. NaN); "in" would raise TypeError on them.
        github_links = [u for u in urls if isinstance(u, str) and "github.com" in u]
        return github_links[0] if github_links else None

    def _compute_id(self):
        """Return a 16-hex-digit id hashed from the title and, if set, the PDF URL."""
        # A missing title (None/NaN) is treated as an empty string.
        paper_name = "" if pd.isna(self.title) else str(self.title)
        if (not(pd.isna(self.pdf_url))):
            paper_name += self.pdf_url

        h = hashlib.sha256()
        h.update(paper_name.encode("utf-8"))
        return h.hexdigest()[:16]

    def log(self, level, message):
        """Record a log entry and, depending on ``self.verbose``, display it.

        Parameters:
            level: Name of a ``LogType`` member or "TIP" (case-insensitive), or
                a ``LogType`` member itself.
            message: Text of the entry.

        Verbosity: 0 records only; 1 and 2 display with ``print``; 3 and 4
        display with ``st.write``; 2 and 4 additionally display TITLE, TIP and
        LOG entries. ERROR entries are displayed in every non-zero mode.
        WARNING, NOTE and INFO entries are recorded but not displayed.

        Raises:
            ValueError: If *level* is neither a ``LogType`` name nor "TIP".
        """
        level_name = (level.name if isinstance(level, Enum) else str(level)).upper()
        if level_name not in LogType.__members__ and level_name != "TIP":
            raise ValueError("Invalid log type. Use 'TITLE', 'TIP', 'LOG', 'NOTE', 'INFO', 'WARNING' or 'ERROR'.")

        self.logs.append({
            # Timezone-aware replacement for the deprecated datetime.utcnow().
            "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
            # Falls back to the plain name if LogType has no such member
            # ("TIP"); to_dict accepts both forms.
            "level": LogType.__members__.get(level_name, level_name),
            "message": message
        })

        if self.verbose in (1, 2):
            show = print
        elif self.verbose in (3, 4):
            show = st.write
        else:
            # verbose == 0, or a value with no display backend: record only.
            return
        show_tips = self.verbose in (2, 4)

        log_text = str(message)

        # Align line-break
        if log_text.startswith("\n"):
            show("\n")
            log_text = log_text.lstrip('\n')

        # TITLE, TIP and LOG are only shown in verbose mode 2 and 4
        if level_name == "ERROR":
            show(f"**{log_text}**")
        elif show_tips and level_name == "TITLE":
            show(f"\n#### {log_text}")
        elif show_tips and level_name == "TIP":
            show(f"*{log_text}*")
        elif show_tips and level_name == "LOG":
            show(f"{log_text}")

    def to_dict(self):
        """Serialise the paper to a flat dict; list/dict fields become JSON strings."""
        serialised_logs = []
        for entry in self.logs:
            # Entries created by log() use the key "level"; entries reloaded by
            # from_row() come from this method's own output and use "type".
            level = entry.get("level", entry.get("type"))
            serialised_logs.append({
                "type": level.value if isinstance(level, Enum) else level,
                "message": entry["message"],
            })

        return {
            "title": self.title,
            "venue": self.venue,
            "year": self.year,
            "pdf_url": self.pdf_url,
            "paper_id": self.paper_id,
            "urls_auto": json.dumps(self.urls_auto),
            "urls_manual": json.dumps(self.urls_manual),
            "logs": json.dumps(serialised_logs),
            "code_reproducibility_manual": json.dumps(self.code_repro_manual),
            "code_reproducibility_auto": json.dumps(self.code_repro_auto),
        }
|
data/dump.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/fetch_miccai.py
DELETED
@@ -1,60 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import requests
|
3 |
-
import re
|
4 |
-
from multiprocessing import Pool, cpu_count
|
5 |
-
from functools import partial
|
6 |
-
|
7 |
-
# Function to process each URL
|
8 |
-
def process_paper(year, url):
|
9 |
-
try:
|
10 |
-
paper_page = requests.get(url).text
|
11 |
-
|
12 |
-
# Find title
|
13 |
-
title_pattern = r'<title>(.*?)\s*</title>'
|
14 |
-
title_match = re.search(title_pattern, paper_page, re.DOTALL)
|
15 |
-
title = title_match.group(1)
|
16 |
-
|
17 |
-
# Find the code repository link
|
18 |
-
code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
|
19 |
-
code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
|
20 |
-
code_repo_link = code_repo_match.group(1) if code_repo_match else ""
|
21 |
-
|
22 |
-
# Find the dataset information
|
23 |
-
dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
|
24 |
-
dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
|
25 |
-
dataset_info = "Yes" if dataset_match else "No"
|
26 |
-
|
27 |
-
# Return a dictionary of the results
|
28 |
-
return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
|
29 |
-
|
30 |
-
except Exception as e:
|
31 |
-
print(f"Error processing {url}: {e}")
|
32 |
-
return None
|
33 |
-
|
34 |
-
current_year = 2024 # Update with the current year
|
35 |
-
MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org/2022/papers/", "https://conferences.miccai.org/2023/papers/"]
|
36 |
-
MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org", "https://conferences.miccai.org"]
|
37 |
-
years = [2021, 2022, 2023]
|
38 |
-
# Set debug mode
|
39 |
-
debug = False
|
40 |
-
|
41 |
-
# Fetch all URLs for each year
|
42 |
-
all_year_urls = []
|
43 |
-
for i in range(len(MICCAI_pages)):
|
44 |
-
year_page = requests.get(MICCAI_pages[i]).text
|
45 |
-
print(year_page)
|
46 |
-
urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0] for line in year_page.split('\n') if "&bullet" in line]
|
47 |
-
all_year_urls.extend([(years[i], url) for url in urls])
|
48 |
-
|
49 |
-
print(all_year_urls)
|
50 |
-
# Parallel processing using Pool
|
51 |
-
# if __name__ == "__main__":
|
52 |
-
# with Pool(processes=12) as pool: # Use 12 processes
|
53 |
-
# results = pool.starmap(process_paper, all_year_urls)
|
54 |
-
|
55 |
-
# # Filter out any None results due to errors
|
56 |
-
# results = [result for result in results if result is not None]
|
57 |
-
|
58 |
-
# miccai = pd.DataFrame(results)
|
59 |
-
# # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
|
60 |
-
# miccai.to_csv('miccai.csv')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/fetch_processed.py
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
import csv
|
2 |
-
import numpy as np
|
3 |
-
import pandas as pd
|
4 |
-
import re
|
5 |
-
|
6 |
-
current_year = 2024
|
7 |
-
MIDL_years = range(2018, current_year + 1, 1)
|
8 |
-
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
9 |
-
|
10 |
-
for venue in custom_order:
|
11 |
-
df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
|
12 |
-
df = df.replace('\t', ' ', regex=True)
|
13 |
-
df.to_csv(f'data/{venue}.csv', sep="\t")
|
14 |
-
|
15 |
-
# Store all evaluations here
|
16 |
-
paper_dump = pd.DataFrame()
|
17 |
-
# Official color codes for conferences
|
18 |
-
MIDL_colors = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000"]
|
19 |
-
|
20 |
-
for venue in custom_order:
|
21 |
-
with open(f'data/{venue}.csv') as file:
|
22 |
-
tsv_file = csv.reader(file, delimiter="\t")
|
23 |
-
for row in tsv_file:
|
24 |
-
if (row[0] == ""):
|
25 |
-
continue
|
26 |
-
|
27 |
-
if (row[1] == ""):
|
28 |
-
continue
|
29 |
-
|
30 |
-
paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]]})], ignore_index=True)
|
31 |
-
paper_dump.to_csv(f'data/dump.csv', sep="\t")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/fetch_zips.py
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
import csv
|
2 |
-
import numpy as np
|
3 |
-
import sys
|
4 |
-
import pandas as pd
|
5 |
-
import re
|
6 |
-
sys.path.append("./")
|
7 |
-
from evaluations.utils import *
|
8 |
-
|
9 |
-
token = os.getenv("githubToken")
|
10 |
-
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
11 |
-
|
12 |
-
for venue in custom_order:
|
13 |
-
df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
|
14 |
-
df = df.replace('\t', ' ', regex=True)
|
15 |
-
df.to_csv(f'data/{venue}.csv', sep="\t")
|
16 |
-
|
17 |
-
# Store all evaluations here
|
18 |
-
paper_dump = pd.DataFrame()
|
19 |
-
# Official color codes for conferences
|
20 |
-
zip_idx = 0
|
21 |
-
|
22 |
-
for venue in custom_order:
|
23 |
-
with open(f'data/{venue}.csv') as file:
|
24 |
-
tsv_file = csv.reader(file, delimiter="\t")
|
25 |
-
for row in tsv_file:
|
26 |
-
if (row[0] == ""):
|
27 |
-
continue
|
28 |
-
|
29 |
-
if (row[1] == ""):
|
30 |
-
continue
|
31 |
-
|
32 |
-
repo_url = row[4]
|
33 |
-
username, repo_name = decompose_url(repo_url)
|
34 |
-
repo_save_name = f"repo_{zip_idx}.zip"
|
35 |
-
repository_zip_name = f"data/test/{repo_save_name}"
|
36 |
-
log(0, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
|
37 |
-
fetch_repo(0, repo_url, repository_zip_name, token)
|
38 |
-
|
39 |
-
if (os.path.exists(repository_zip_name)):
|
40 |
-
paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]], "zip_idx": [ repository_zip_name ]})], ignore_index=True)
|
41 |
-
zip_idx += 1
|
42 |
-
|
43 |
-
paper_dump.to_csv(f'data/zipfiles.csv', sep="\t")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/zipfiles.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
data_generation/fetch_processed.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
4 |
+
sys.path.append(ROOT_DIR)
|
5 |
+
import csv
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import re
|
9 |
+
import os
|
10 |
+
from core.paper import Paper
|
11 |
+
|
12 |
+
import sys
|
13 |
+
import os
|
14 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
15 |
+
sys.path.append(ROOT_DIR)
|
16 |
+
import csv
|
17 |
+
import numpy as np
|
18 |
+
import sys
|
19 |
+
import pandas as pd
|
20 |
+
import re
|
21 |
+
from evaluations.utils import *
|
22 |
+
import pandas as pd
|
23 |
+
from evaluations.url import fetch_url
|
24 |
+
from concurrent.futures import ThreadPoolExecutor
|
25 |
+
import os
|
26 |
+
from core.paper import Paper
|
27 |
+
from core.conversion import download_repo
|
28 |
+
from tqdm import tqdm
|
29 |
+
from concurrent.futures import ProcessPoolExecutor
|
30 |
+
|
31 |
+
import sys
|
32 |
+
import os
|
33 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
34 |
+
sys.path.append(ROOT_DIR)
|
35 |
+
import csv
|
36 |
+
import numpy as np
|
37 |
+
import sys
|
38 |
+
import pandas as pd
|
39 |
+
import re
|
40 |
+
from evaluations.utils import *
|
41 |
+
import pandas as pd
|
42 |
+
from evaluations.url import fetch_url
|
43 |
+
from concurrent.futures import ThreadPoolExecutor
|
44 |
+
import os
|
45 |
+
from core.paper import Paper
|
46 |
+
from core.conversion import download_repo, pdf_to_grobid
|
47 |
+
from tqdm import tqdm
|
48 |
+
from concurrent.futures import ProcessPoolExecutor
|
49 |
+
from config.constants import VENUE_ORDER
|
50 |
+
|
51 |
+
import pandas as pd
|
52 |
+
import sys
|
53 |
+
import os
|
54 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
55 |
+
sys.path.append(ROOT_DIR)
|
56 |
+
from evaluations.url import fetch_url
|
57 |
+
from concurrent.futures import ThreadPoolExecutor
|
58 |
+
import os
|
59 |
+
from core.paper import Paper
|
60 |
+
from core.conversion import download_pdf
|
61 |
+
from tqdm import tqdm
|
62 |
+
from concurrent.futures import ProcessPoolExecutor
|
63 |
+
|
64 |
+
def get_urls_and_assign(paper):
    """Store the result of ``fetch_url`` for the paper's PDF in ``paper.urls_auto``.

    Any exception is recorded as an ERROR entry on the paper. The paper is
    returned either way.
    """
    try:
        paper.urls_auto = fetch_url(paper.pdf_path)  # Just update this
    except Exception as err:
        paper.log("ERROR", str(err))
    return paper
|
71 |
+
|
72 |
+
def download_xml(paper):
    """Convert the paper's PDF to Grobid XML at ``paper.xml_path``.

    Papers without a PDF path are returned untouched, and an existing XML file
    is not regenerated. Failures are recorded as ERROR entries on the paper.

    Returns:
        The same *paper* object in every case.
    """
    try:
        if (paper.pdf_path is None):
            return paper

        if (os.path.exists(paper.xml_path)):
            paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
            return paper

        pdf_to_grobid(paper.pdf_path, paper.xml_path)
    except Exception as e:
        # This message used to read "Repo download failed", which pointed at
        # the wrong pipeline stage.
        paper.log("ERROR", f"Grobid XML conversion failed: {e}")
    return paper
|
87 |
+
|
88 |
+
|
89 |
+
# Number of worker processes used for each pipeline stage.
max_workers = 6
if __name__ == "__main__":
    # The pipeline writes below these folders; create them so a fresh checkout
    # (where data/ does not exist yet) does not fail on the first write.
    for folder in ("data/online_sheet", "data/papers", "data/xml", "data/test"):
        os.makedirs(folder, exist_ok=True)

    # Snapshot every venue sheet of the online spreadsheet as a local TSV.
    for venue in VENUE_ORDER:
        df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
        df = df.replace('\t', ' ', regex=True)
        df = df.replace('[]', '')
        df.to_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")

    papers = []
    for venue in VENUE_ORDER:
        paper_list = pd.read_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
        paper_list["Venue"] = venue
        for _, row in paper_list.iterrows():
            # Skip rows whose first two fields are empty. read_csv turns empty
            # cells into NaN, so comparing against "" alone never matches.
            if any(pd.isna(cell) or cell == "" for cell in row.iloc[:2]):
                continue
            papers.append(Paper.from_raw(row))

    # Each stage returns the (possibly updated) papers, which feed the next stage.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))

    results = [p.to_dict() for p in papers]
    results_df = pd.DataFrame(results)
    results_df.to_csv("data/papers.csv", sep="\t", index=False)
|
{data → data_generation/paper_scraping}/fetch_arxiv.py
RENAMED
@@ -1,10 +1,23 @@
|
|
1 |
import pandas as pd
|
2 |
import requests
|
3 |
import pdfplumber
|
|
|
4 |
import re
|
5 |
from multiprocessing import Pool, cpu_count
|
6 |
from functools import partial
|
|
|
7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# Function to process each URL
|
9 |
def process_arxiv_paper(article_link):
|
10 |
try:
|
@@ -16,69 +29,70 @@ def process_arxiv_paper(article_link):
|
|
16 |
|
17 |
article_id = article_link.split("/")[-1]
|
18 |
pdf_url = f'https://arxiv.org/pdf/{article_id}'
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
if (response.status_code == 404):
|
24 |
-
print("Failed to fetch pdf")
|
25 |
-
return None
|
26 |
-
|
27 |
-
urls = []
|
28 |
-
link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
|
29 |
-
with pdfplumber.open(f"{article_id}.pdf") as pdf:
|
30 |
-
# Loop through all pages
|
31 |
-
for page_num, page in enumerate(pdf.pages):
|
32 |
-
# Extract text from the page
|
33 |
-
text = page.extract_text()
|
34 |
-
|
35 |
-
# Search for a specific word or phrase
|
36 |
-
found_urls = re.findall(link_pattern, text)
|
37 |
-
urls.extend(found_urls)
|
38 |
-
os.remove(f"{article_id}.pdf")
|
39 |
-
urls = [url for url in urls if ("pytorch" not in url) & ("fchollet" not in url) & (len(url.split("github.com")[1].split("/")) >= 3)]
|
40 |
-
print(urls)
|
41 |
-
url = urls[0] if len(urls) > 0 else ""
|
42 |
-
|
43 |
-
# Return a dictionary of the results
|
44 |
-
return {"venue": "arXiv", "title": title, "url": url, "year": year}
|
45 |
|
46 |
except Exception as e:
|
47 |
print(f"Error processing {article_link}: {e}")
|
48 |
return None
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
all_year_urls = []
|
54 |
|
55 |
-
page_size = 50
|
56 |
-
search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
|
57 |
articles = []
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
start_idx = page_size * page
|
62 |
-
url = f"{search_query}{start_idx}"
|
63 |
-
current_page = requests.get(url).text
|
64 |
-
pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
|
65 |
-
matches = re.findall(pattern, current_page)
|
66 |
-
if (len(matches) == 0):
|
67 |
-
break
|
68 |
-
else:
|
69 |
-
page += 1
|
70 |
|
71 |
-
articles += matches
|
72 |
-
articles = np.unique(articles)
|
73 |
|
74 |
# Parallel processing using Pool
|
75 |
if __name__ == "__main__":
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
# Filter out any None results due to errors
|
80 |
results = [result for result in results if result is not None]
|
81 |
|
82 |
# Convert the list of dictionaries to a DataFrame
|
83 |
arxiv = pd.DataFrame(results)
|
84 |
-
arxiv.to_csv('arxiv.csv')
|
|
|
1 |
import pandas as pd
|
2 |
import requests
|
3 |
import pdfplumber
|
4 |
+
import numpy as np
|
5 |
import re
|
6 |
from multiprocessing import Pool, cpu_count
|
7 |
from functools import partial
|
8 |
+
import urllib, urllib.request
|
9 |
import os
|
10 |
+
import sys
|
11 |
+
from tqdm import tqdm
|
12 |
+
from tqdm.contrib.concurrent import process_map # better for multiprocessing
|
13 |
+
import feedparser
|
14 |
+
import time
|
15 |
+
from datetime import datetime
|
16 |
+
from tqdm import tqdm
|
17 |
+
|
18 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
19 |
+
sys.path.append(ROOT_DIR)
|
20 |
+
|
21 |
# Function to process each URL
|
22 |
def process_arxiv_paper(article_link):
|
23 |
try:
|
|
|
29 |
|
30 |
article_id = article_link.split("/")[-1]
|
31 |
pdf_url = f'https://arxiv.org/pdf/{article_id}'
|
32 |
+
|
33 |
+
urls = [] # fetch_url(pdf_url)
|
34 |
+
|
35 |
+
return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
except Exception as e:
|
38 |
print(f"Error processing {article_link}: {e}")
|
39 |
return None
|
40 |
|
41 |
+
# Pagination settings for the arXiv export API. ``page_size`` drives both the
# ``max_results`` query parameter and the offset increment, so the two values
# cannot drift apart.
page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = f"search_query=all:(deep+learning)+AND+all:cancer&max_results={page_size}"

articles = []
start = 0
max_empty_pages = 3  # give up after this many consecutive empty responses
empty_pages = 0


# Collect article links page by page, then process every article in parallel.
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")

    while True:
        # Build the URL for the current page and parse the returned Atom feed.
        url = f"{base_query}?{query_params}&start={start}"
        feed = feedparser.parse(url)
        entries = feed.entries

        if not entries:
            empty_pages += 1
            print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            # Retry the SAME offset after a pause. An empty page may be followed
            # by a successful one (the counter is reset on success), so moving
            # the offset forward here would silently drop a page of results.
            time.sleep(4)
            continue

        empty_pages = 0  # reset empty count on success

        # Keep only articles published on or after 2018-01-01.
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)

        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")

        # A short page means there is nothing left to fetch.
        if len(entries) < page_size:
            print("Reached last page of results.")
            break

        start += page_size
        time.sleep(4)  # Respect rate limit

    # De-duplicate the links before fanning out to the worker processes.
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Make sure the target directory exists so the write cannot fail after the
    # (slow) scraping has already finished.
    os.makedirs('data/raw', exist_ok=True)

    # Convert the list of dictionaries to a DataFrame
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('data/raw/arxiv.csv')
|
data_generation/paper_scraping/fetch_miccai.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
import re
|
4 |
+
from tqdm import tqdm
|
5 |
+
from tqdm.contrib.concurrent import process_map
|
6 |
+
from multiprocessing import cpu_count
|
7 |
+
|
8 |
+
# --- Parse a single paper page ---
|
9 |
+
def process_paper(year, url, timeout=30):
    """Scrape one MICCAI paper page and return its metadata.

    Args:
        year: Publication year; copied into the result unchanged.
        url: Address of the paper page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker forever.

    Returns:
        A dict with the keys ``title``, ``year``, ``url`` (link found under the
        ``code-id`` heading, or ``""``) and ``public`` (``"Yes"`` when a
        ``dataset-id`` section is matched, otherwise ``"No"``), or ``None`` if
        anything goes wrong while fetching or parsing the page.
    """
    try:
        paper_page = requests.get(url, timeout=timeout).text

        # Title: contents of the <title> tag, or "" when the tag is absent.
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repo link: first anchor following the "code-id" heading.
        code_repo_match = re.search(r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info: only the presence of the section is recorded.
        dataset_match = re.search(r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info,
        }

    except Exception as e:
        # Best effort: one failing page must not abort the whole parallel run.
        print(f"Error processing {url}: {e}")
        return None
|
35 |
+
|
36 |
+
# --- Main Execution ---
|
37 |
+
if __name__ == "__main__":
    import os  # only the script entry point needs it

    # Index page, link prefix and year of each MICCAI edition (parallel lists).
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/"
    ]
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org"
    ]
    years = [2021, 2022, 2023, 2024]
    output_path = 'data/raw/miccai.csv'

    all_year_urls = []

    print("🔍 Fetching paper URLs by year...")
    editions = zip(years, MICCAI_pages, MICCAI_root)
    for year, page_url, root in tqdm(editions, total=len(years), desc="Years"):
        try:
            # A timeout keeps one unresponsive index page from hanging the run.
            response = requests.get(page_url, timeout=30)
            year_page = response.text
            if year == 2024:
                # The 2024 index links papers through a fixed path pattern.
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [root + match for match in matches]
            else:
                # Earlier index pages list one paper per "&bullet" line.
                urls = [
                    root + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(year, url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {year}: {e}")

    print(f"📄 Total papers found: {len(all_year_urls)}")

    # --- Parallel scrape each paper page ---
    print("⚙️ Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],
        [item[1] for item in all_year_urls],
        max_workers=12,
        chunksize=1,
        desc="Parsing Papers"
    )

    # Drop the pages that failed to download or parse.
    results = [r for r in results if r is not None]

    # Ensure the target directory exists so the write cannot fail after the
    # scraping has already finished.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    miccai = pd.DataFrame(results)
    miccai.to_csv(output_path, index=False)
    # Report the path that was actually written.
    print(f"✅ Saved to {output_path}")
|
{data → data_generation/paper_scraping}/fetch_nature.py
RENAMED
@@ -1,19 +1,23 @@
|
|
1 |
import pandas as pd
|
2 |
import requests
|
|
|
3 |
import re
|
4 |
from multiprocessing import Pool, cpu_count
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
# Function to process each URL
|
8 |
def process_nature_paper(article_link):
|
9 |
try:
|
10 |
-
|
11 |
-
article_text = requests.get(
|
12 |
|
13 |
pattern = r'Code availability.*?<a href="([^"]+)"'
|
14 |
matches = re.findall(pattern, article_text, re.DOTALL)
|
15 |
urls = [link for link in matches if "github" in link]
|
16 |
-
url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
|
17 |
|
18 |
year = re.findall(r'datetime="(\d{4})', article_text)[0]
|
19 |
# # Find title
|
@@ -25,10 +29,10 @@ def process_nature_paper(article_link):
|
|
25 |
dataset_info = "Yes" if (len(matches) > 0) else "No"
|
26 |
|
27 |
# # Return a dictionary of the results
|
28 |
-
return {"title": title, "
|
29 |
|
30 |
except Exception as e:
|
31 |
-
print(f"Error processing {
|
32 |
return None
|
33 |
|
34 |
# Set debug mode
|
@@ -36,33 +40,29 @@ debug = False
|
|
36 |
|
37 |
# Fetch all URLs for each year
|
38 |
all_year_urls = []
|
39 |
-
search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]
|
40 |
articles = []
|
41 |
-
|
|
|
42 |
page = 1
|
43 |
-
while
|
44 |
url = f"{search_query}{page}"
|
45 |
current_page = requests.get(url).text
|
46 |
pattern = r'href="/articles/([^"]+)"'
|
47 |
matches = re.findall(pattern, current_page)
|
48 |
-
if
|
49 |
break
|
50 |
else:
|
51 |
page += 1
|
52 |
-
|
53 |
-
articles += matches
|
54 |
articles = np.unique(articles)
|
55 |
|
56 |
|
57 |
# Parallel processing using Pool
|
58 |
if __name__ == "__main__":
|
59 |
-
|
60 |
-
results = pool.starmap(process_nature_paper, [[article] for article in articles])
|
61 |
-
|
62 |
-
# Filter out any None results due to errors
|
63 |
results = [result for result in results if result is not None]
|
64 |
|
65 |
-
# Convert the list of dictionaries to a DataFrame
|
66 |
nature = pd.DataFrame(results)
|
67 |
nature = nature[['title', 'year', 'pdf', 'url', 'public']]
|
68 |
-
nature.to_csv('nature.csv')
|
|
|
1 |
import pandas as pd
|
2 |
import requests
|
3 |
+
import os
|
4 |
import re
|
5 |
from multiprocessing import Pool, cpu_count
|
6 |
+
import numpy as np
|
7 |
+
from tqdm import tqdm
|
8 |
+
from tqdm.contrib.concurrent import process_map # better for multiprocessing
|
9 |
+
|
10 |
|
11 |
# Function to process each URL
|
12 |
def process_nature_paper(article_link):
|
13 |
try:
|
14 |
+
pdf_url = f'https://www.nature.com/articles/{article_link}'
|
15 |
+
article_text = requests.get(pdf_url).text
|
16 |
|
17 |
pattern = r'Code availability.*?<a href="([^"]+)"'
|
18 |
matches = re.findall(pattern, article_text, re.DOTALL)
|
19 |
urls = [link for link in matches if "github" in link]
|
20 |
+
# url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
|
21 |
|
22 |
year = re.findall(r'datetime="(\d{4})', article_text)[0]
|
23 |
# # Find title
|
|
|
29 |
dataset_info = "Yes" if (len(matches) > 0) else "No"
|
30 |
|
31 |
# # Return a dictionary of the results
|
32 |
+
return {"title": title, "year": year, "pdf": pdf_url + ".pdf", "url": urls, "public": dataset_info}
|
33 |
|
34 |
except Exception as e:
|
35 |
+
print(f"Error processing {pdf_url}: {e}")
|
36 |
return None
|
37 |
|
38 |
# Set debug mode
|
|
|
40 |
|
41 |
# Fetch all URLs for each year
|
42 |
all_year_urls = []
search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page="]
articles = []


# Everything that touches the network lives under the ``__main__`` guard.
# Worker processes started with the "spawn" method re-import this module, and
# module-level scraping would then be repeated once per worker.
if __name__ == "__main__":
    for search_query in tqdm(search_queries, desc="Search Queries"):
        page = 1
        # Walk the result pages until one has no article links (100 pages max).
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url).text
            pattern = r'href="/articles/([^"]+)"'
            matches = re.findall(pattern, current_page)
            if not matches:
                break
            page += 1
            articles += matches

    # The two searches overlap, so de-duplicate the article ids.
    articles = np.unique(articles)

    results = process_map(process_nature_paper, articles, max_workers=12, chunksize=1, desc="Processing Articles")
    results = [result for result in results if result is not None]

    # Passing ``columns`` selects and orders the fields, and still yields a
    # (header-only) frame instead of a KeyError when no article was processed.
    nature = pd.DataFrame(results, columns=['title', 'year', 'pdf', 'url', 'public'])
    nature.to_csv('data/raw/nature.csv')
|
evaluations/documentation.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
-
from .utils import
|
2 |
import re
|
3 |
import numpy as np
|
|
|
4 |
|
5 |
-
def is_applicable(
|
6 |
res_training = "NA"
|
7 |
res_evaluation = "NA"
|
8 |
res_weights = "NA"
|
9 |
|
10 |
if (llm):
|
11 |
-
|
12 |
res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
|
13 |
res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
|
14 |
res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
|
@@ -16,31 +17,28 @@ def is_applicable(verbose, llm, readme):
|
|
16 |
applicable = f"{res_training}/{res_evaluation}/{res_weights}"
|
17 |
return applicable
|
18 |
|
19 |
-
def evaluate(
|
20 |
-
|
21 |
overall = "No"
|
22 |
|
23 |
-
|
24 |
code_to_comment_ratio = get_code_to_comment_ratio(zip)
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
if (readme):
|
29 |
non_empty_rows = [row for row in readme.split("\n") if row != ""]
|
30 |
if (len(non_empty_rows) < 5):
|
31 |
-
|
32 |
|
33 |
if (llm):
|
34 |
code = fetch_code(zip)
|
35 |
if (llm):
|
36 |
summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
|
37 |
-
|
38 |
return overall
|
39 |
|
40 |
-
if (count_code_lines(non_empty_rows) >
|
41 |
-
|
42 |
-
|
43 |
-
return overall
|
44 |
|
45 |
|
46 |
if (llm):
|
@@ -50,26 +48,30 @@ def evaluate(verbose, llm, zip, readme):
|
|
50 |
and evaluate the proposed model?'
|
51 |
llm.predict("HELP", prompt)
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
manual_fail = False
|
55 |
if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
log(verbose, "ERROR", "Readme file missing testing information")
|
60 |
-
overall = "No"
|
61 |
|
62 |
-
if ((len(re.findall("
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
-
if ((len(re.findall("
|
67 |
-
(
|
68 |
-
|
69 |
-
|
70 |
-
overall = "No"
|
71 |
|
72 |
-
|
|
|
73 |
|
74 |
def count_comment_lines(lines):
|
75 |
# Initialize counters
|
|
|
1 |
+
from .utils import fetch_code
|
2 |
import re
|
3 |
import numpy as np
|
4 |
+
from core.conversion import noop_logger
|
5 |
|
6 |
+
def is_applicable(llm, readme, log_fn=noop_logger):
|
7 |
res_training = "NA"
|
8 |
res_evaluation = "NA"
|
9 |
res_weights = "NA"
|
10 |
|
11 |
if (llm):
|
12 |
+
log_fn("TITLE", "\nChecking what parts of the evaluations are applicable...")
|
13 |
res_training = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for training a model?")
|
14 |
res_evaluation = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for evaluating a model?")
|
15 |
res_weights = llm.predict("STRICT", f"{readme}\nBased on the readme above, should the repository contain code for loading pre-trained weights?")
|
|
|
17 |
applicable = f"{res_training}/{res_evaluation}/{res_weights}"
|
18 |
return applicable
|
19 |
|
20 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
|
21 |
+
log_fn("TITLE", "\nEvaluating code documentation...")
|
22 |
overall = "No"
|
23 |
|
|
|
24 |
code_to_comment_ratio = get_code_to_comment_ratio(zip)
|
25 |
+
log_fn("LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
|
26 |
+
result = { "dependencies": "No", "training": "No", "evaluation": "No", "weights": "No", "scripts": "No" }
|
27 |
+
for readme in readmes:
|
|
|
28 |
non_empty_rows = [row for row in readme.split("\n") if row != ""]
|
29 |
if (len(non_empty_rows) < 5):
|
30 |
+
log_fn("ERROR", "Readme file has very few lines")
|
31 |
|
32 |
if (llm):
|
33 |
code = fetch_code(zip)
|
34 |
if (llm):
|
35 |
summary = llm.predict("HELP", f"{code}\nBased on the readme file above can you give a quick summary of this repository? Please use references to file names on the repository.")
|
36 |
+
log_fn("LOG", f"Based on the code, your readme file could be something like...\n{summary}")
|
37 |
return overall
|
38 |
|
39 |
+
if (count_code_lines(non_empty_rows) > 2):
|
40 |
+
log_fn("LOG", "Readme file contains python examples.")
|
41 |
+
result["scripts"] = "Yes"
|
|
|
42 |
|
43 |
|
44 |
if (llm):
|
|
|
48 |
and evaluate the proposed model?'
|
49 |
llm.predict("HELP", prompt)
|
50 |
|
51 |
+
if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
|
52 |
+
(len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
|
53 |
+
(len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
|
54 |
+
log_fn("ERROR", "Readme file missing information about package dependencies")
|
55 |
+
else:
|
56 |
+
result["dependencies"] = "Yes"
|
57 |
|
|
|
58 |
if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
|
59 |
+
log_fn("ERROR", "Readme file missing training information")
|
60 |
+
else:
|
61 |
+
result["training"] = "Yes"
|
|
|
|
|
62 |
|
63 |
+
if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
|
64 |
+
log_fn("ERROR", "Readme file missing testing information")
|
65 |
+
else:
|
66 |
+
result["evaluating"] = "Yes"
|
67 |
|
68 |
+
if ((len(re.findall("example", readme, re.IGNORECASE)) == 0)):
|
69 |
+
log_fn("LOG", "Readme file contains no links to examples")
|
70 |
+
else:
|
71 |
+
result["evaluating"] = "Yes"
|
|
|
72 |
|
73 |
+
score = np.sum(np.array(list(result.values()), dtype=str) == "Yes")
|
74 |
+
return "Yes" if score >= 2 else "No"
|
75 |
|
76 |
def count_comment_lines(lines):
|
77 |
# Initialize counters
|
evaluations/license.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
-
from .utils import log
|
2 |
import re
|
|
|
3 |
|
4 |
-
def evaluate(
|
5 |
-
|
6 |
overall = "No"
|
7 |
-
license_files = [
|
8 |
if (len(license_files) > 0):
|
9 |
license = zip.open(license_files[0]).read().decode("utf-8")
|
10 |
ans = [row for row in license.split("\n") if row != ""]
|
@@ -13,18 +13,18 @@ def evaluate(verbose, llm, zip, readme):
|
|
13 |
license = license
|
14 |
prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
|
15 |
ans = llm.predict("HELP", prompt)
|
16 |
-
|
17 |
else:
|
18 |
-
|
19 |
|
20 |
overall = "Yes"
|
21 |
return overall
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
|
29 |
-
|
30 |
return overall
|
|
|
|
|
1 |
import re
|
2 |
+
from core.conversion import noop_logger
|
3 |
|
4 |
+
def evaluate(llm, zip, readme, log_fn=noop_logger):
|
5 |
+
log_fn("TITLE", "\nEvaluating repository licensing...")
|
6 |
overall = "No"
|
7 |
+
license_files = [license_path for license_path in zip.namelist() if ((("license" in license_path.lower())) & (len(license_path.split("/")) == 2))]
|
8 |
if (len(license_files) > 0):
|
9 |
license = zip.open(license_files[0]).read().decode("utf-8")
|
10 |
ans = [row for row in license.split("\n") if row != ""]
|
|
|
13 |
license = license
|
14 |
prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
|
15 |
ans = llm.predict("HELP", prompt)
|
16 |
+
log_fn("LOG", f"Found license: {ans}")
|
17 |
else:
|
18 |
+
log_fn("LOG", f"Found license file: {license_files[0]}")
|
19 |
|
20 |
overall = "Yes"
|
21 |
return overall
|
22 |
|
23 |
+
for readme_file in readme:
|
24 |
+
if ("license" in readme_file.lower()):
|
25 |
+
log_fn("LOG", "License found in README.")
|
26 |
+
overall = "Yes"
|
27 |
+
return overall
|
28 |
|
29 |
+
log_fn("ERROR", "LICENSE file not found.")
|
30 |
return overall
|
evaluations/pitfalls.py
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
-
from .utils import
|
2 |
import re
|
|
|
3 |
|
4 |
-
def evaluate(
|
5 |
-
|
6 |
codebase = fetch_code(zip)
|
7 |
|
8 |
if (llm):
|
9 |
for code in codebase:
|
10 |
pitfall_check = llm.predict("STRICT", f"{codebase[code]}Do you find any signs of serious issues in this code?")
|
11 |
if (("Yes" in pitfall_check) & ("No" not in pitfall_check)):
|
12 |
-
|
13 |
-
|
|
|
1 |
+
from .utils import fetch_code
|
2 |
import re
|
3 |
+
from core.conversion import noop_logger
|
4 |
|
5 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
    """Ask the LLM to flag source files that show signs of serious issues.

    The code is fetched from ``zip`` with ``fetch_code``; nothing further
    happens when no ``llm`` is supplied. ``readmes`` is accepted but not used
    here. Findings are reported through ``log_fn`` only; nothing is returned.
    """
    log_fn("TITLE", "\nLooking for common pitfalls (in development)...")
    codebase = fetch_code(zip)

    if not llm:
        return

    for file_name in codebase:
        source = codebase[file_name]
        verdict = llm.predict("STRICT", f"{source}Do you find any signs of serious issues in this code?")

        # Only a "Yes" with no "No" anywhere in the answer counts as a finding.
        flagged = ("Yes" in verdict) and ("No" not in verdict)
        if not flagged:
            continue

        log_fn("ERROR", f"Found possible issues in {file_name}")
        details = llm.predict("PITFALL", f"File name {file_name} file {codebase[file_name]}\n Can you find any signs of common pitfalls in this code?")
        log_fn("LOG", details)
|
evaluations/repo_evaluations.py
CHANGED
@@ -2,84 +2,71 @@ import pandas as pd
|
|
2 |
import os
|
3 |
from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
|
4 |
from evaluations.utils import *
|
|
|
|
|
5 |
import zipfile
|
|
|
|
|
6 |
import os
|
7 |
import numpy as np
|
8 |
from huggingface_hub import InferenceClient
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
|
11 |
try:
|
12 |
if (not(llm)):
|
13 |
-
|
14 |
|
15 |
-
|
16 |
|
17 |
-
if ((title != None) & (year != None) & (title != "") & (year != "")):
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
if (get_api_link(repo_url) != ""):
|
26 |
-
results["pred_valid"] = True
|
27 |
-
else:
|
28 |
-
return results
|
29 |
-
|
30 |
-
username, repo_name = decompose_url(repo_url)
|
31 |
-
|
32 |
-
# If you don't provide a zip file, it will be fetched from github. For this, you need to provide a github token.
|
33 |
-
if (zip is None):
|
34 |
-
token = os.getenv("githubToken")
|
35 |
-
repository_zip_name = "data/repo.zip"
|
36 |
-
log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
|
37 |
-
|
38 |
-
fetch_repo(verbose, repo_url, repository_zip_name, token)
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
|
45 |
|
46 |
-
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
results["NA"] = documentation.is_applicable(verbose, llm, readme)
|
50 |
-
|
51 |
-
results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
|
52 |
|
53 |
if (len(zip.namelist()) <= 2):
|
54 |
-
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
pitfalls.evaluate(
|
63 |
-
|
64 |
-
return
|
65 |
except Exception as e:
|
66 |
-
|
67 |
-
|
68 |
-
return
|
69 |
-
|
70 |
-
def
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
print(str(int(100 * idx / paper_dump["title"].count())) + "% done")
|
80 |
-
result = evaluate(None, False, row["url"], row["title"], row["year"], zip=zipfile.ZipFile(row["zip_idx"]))
|
81 |
-
for column in result.keys():
|
82 |
-
row[column] = result[column]
|
83 |
-
|
84 |
-
full_results.append(row)
|
85 |
-
return pd.DataFrame(full_results)
|
|
|
2 |
import os
|
3 |
from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
|
4 |
from evaluations.utils import *
|
5 |
+
from core.conversion import fetch_repo, decompose_url
|
6 |
+
|
7 |
import zipfile
|
8 |
+
import csv
|
9 |
+
|
10 |
import os
|
11 |
import numpy as np
|
12 |
from huggingface_hub import InferenceClient
|
13 |
+
from concurrent.futures import ThreadPoolExecutor
|
14 |
+
from core.conversion import noop_logger
|
15 |
+
|
16 |
+
token = os.getenv("githubToken")
|
17 |
+
def evaluate(llm, paper, log_fn=noop_logger):
    """Run every automatic reproducibility check on a paper's repository zip.

    Args:
        llm: Optional language model handed to the individual checks; a falsy
            value means the checks run without one.
        paper: Object exposing ``zip_path``; its ``code_repro_auto`` attribute
            is overwritten with the results of this run.
        log_fn: Callable ``log_fn(level, message)`` used for progress output.

    Returns:
        The same ``paper`` object. ``code_repro_auto["live"]`` is ``"No"`` when
        any step failed (including opening the archive), otherwise ``"Yes"``.
    """
    # Create the result record first so the error handler below can always
    # mark the run as failed, whatever step raised.
    paper.code_repro_auto = {
        "live": "Yes",
        "dependencies": None,
        "training": None,
        "evaluation": None,
        "weights": None,
        "readme": None,
        "license": None,
        "stars": None,
        "citations": None,
        "valid": False,
    }

    try:
        if not llm:
            log_fn("LOG", "No LLM will be used for the evaluation.")

        # NOTE: the citation, URL-validity and star-count lookups are disabled
        # in this version, so "citations", "valid" and "stars" keep the default
        # values set above.

        # Opening the archive inside the try block means a missing or corrupt
        # zip is reported through "live" instead of escaping to the caller, and
        # the context manager guarantees the file handle is released.
        with zipfile.ZipFile(paper.zip_path) as repo_zip:
            readmes = fetch_readmes(repo_zip)
            paper.code_repro_auto["NA"] = documentation.is_applicable(llm, readmes, log_fn)

            paper.code_repro_auto["license"] = license.evaluate(llm, repo_zip, readmes, log_fn)

            # Only logged; the remaining checks still run on a near-empty archive.
            if len(repo_zip.namelist()) <= 2:
                log_fn("LOG", "The repository is empty.")

            paper.code_repro_auto["dependencies"] = requirements.evaluate(llm, repo_zip, readmes, log_fn)
            paper.code_repro_auto["training"] = training.evaluate(llm, repo_zip, readmes, log_fn)
            paper.code_repro_auto["evaluation"] = validating.evaluate(llm, repo_zip, readmes, log_fn)
            paper.code_repro_auto["weights"] = weights.evaluate(llm, repo_zip, readmes, log_fn)
            paper.code_repro_auto["readme"] = documentation.evaluate(llm, repo_zip, readmes, log_fn)
            paper.code_repro_auto["codetocomment"] = documentation.get_code_to_comment_ratio(repo_zip)
            pitfalls.evaluate(llm, repo_zip, readmes, log_fn)

        return paper
    except Exception as e:
        # Best effort: record the failure on the paper instead of aborting a
        # batch run over many papers.
        log_fn("ERROR", "Evaluating repository failed: " + str(e))
        paper.code_repro_auto["live"] = "No"
        return paper
|
64 |
+
|
65 |
+
def process_row(paper):
    """Evaluate one paper without an LLM, provided its repository zip exists.

    Logs an error through ``paper.log`` and returns the paper untouched when
    ``paper.zip_path`` is ``None`` or does not exist on disk; otherwise returns
    the result of ``evaluate``.
    """
    archive = paper.zip_path
    if archive is not None and os.path.exists(archive):
        return evaluate(None, paper, paper.log)

    paper.log("ERROR", "Zip file doesn't exist")
    return paper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluations/requirements.py
CHANGED
@@ -1,24 +1,26 @@
|
|
1 |
-
from .
|
2 |
|
3 |
-
def evaluate(
|
4 |
-
|
5 |
overall = "No"
|
6 |
|
7 |
scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
|
8 |
|
9 |
files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
|
10 |
-
files = [file_path for file_path in files if len(file_path.split("/")) == 2]
|
11 |
for file in files:
|
12 |
-
|
13 |
requirements = zip.open(file).read().decode("utf-8")
|
14 |
-
|
15 |
if (len(requirements.split("\n")) < 5):
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
23 |
|
24 |
return overall
|
|
|
1 |
+
from core.conversion import noop_logger
|
2 |
|
3 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
    """Check whether the repository declares its package dependencies.

    Args:
        llm: Unused by this check; kept for a uniform evaluator signature.
        zip: Open ``zipfile.ZipFile`` of the repository.
        readmes: Iterable of README texts (empty entries are skipped).
        log_fn: Callable ``log_fn(level, message)`` used for progress output.

    Returns:
        ``"Yes"`` when a candidate dependency file with at least five lines is
        found, or when a README mentions requirements, dependencies or has an
        "environment" heading; otherwise ``"No"``.
    """
    log_fn("TITLE", "\nLooking for package dependencies for running the code...")
    overall = "No"

    # Candidate files are matched by name only, at any depth of the archive.
    files = [
        file_path
        for file_path in zip.namelist()
        if file_path.endswith((".yml", "setup.py", "requirements.txt"))
        or "requirement" in file_path
        or "package" in file_path
    ]
    for file in files:
        log_fn("LOG", f"Found requirements file: {file}")
        # The name filter is loose enough to match binary files, so decode
        # leniently rather than let a UnicodeDecodeError abort the evaluation.
        # ``ZipFile.read`` also closes the member handle for us.
        requirements = zip.read(file).decode("utf-8", errors="replace")

        if len(requirements.split("\n")) < 5:
            log_fn("ERROR", "Requirements file contains too few lines.")
            continue
        overall = "Yes"

    keywords = ("requirement", "Requirement", "Dependenc", "dependenc")
    for readme in readmes:
        if not readme:
            continue
        mentions_dependencies = any(word in readme for word in keywords)
        # A markdown heading ("#") that talks about the environment also counts.
        has_environment_heading = any(
            "#" in row and ("environment" in row or "Environment" in row)
            for row in readme.split("\n")
        )
        if mentions_dependencies or has_environment_heading:
            log_fn("LOG", "Found dependencies in README file")
            overall = "Yes"

    return overall
|
evaluations/training.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
from .utils import log
|
2 |
import re
|
|
|
3 |
|
4 |
-
def evaluate(
|
5 |
-
|
6 |
overall = "No"
|
7 |
|
8 |
|
@@ -23,15 +23,15 @@ def evaluate(verbose, llm, zip, readme):
|
|
23 |
for framework, regex_list in patterns.items():
|
24 |
for pattern in regex_list:
|
25 |
if re.search(pattern, code):
|
26 |
-
|
27 |
overall = "Yes"
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
|
35 |
if (overall == "No"):
|
36 |
-
|
37 |
return overall
|
|
|
|
|
1 |
import re
|
2 |
+
from core.conversion import noop_logger
|
3 |
|
4 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
|
5 |
+
log_fn("TITLE", "\nLooking for code to train the model...")
|
6 |
overall = "No"
|
7 |
|
8 |
|
|
|
23 |
for framework, regex_list in patterns.items():
|
24 |
for pattern in regex_list:
|
25 |
if re.search(pattern, code):
|
26 |
+
log_fn("LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
|
27 |
overall = "Yes"
|
28 |
|
29 |
+
for readme in readmes:
|
30 |
+
if (readme):
|
31 |
+
if (("train" in readme)):
|
32 |
+
log_fn("LOG", "Found something about training in README file")
|
33 |
+
overall = "Yes"
|
34 |
|
35 |
if (overall == "No"):
|
36 |
+
log_fn("ERROR", "Found no code for training the model.")
|
37 |
return overall
|
evaluations/url.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
import zipfile
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
from uuid import uuid4
|
7 |
+
import pdfplumber
|
8 |
+
import numpy as np
|
9 |
+
from urllib.request import urlretrieve
|
10 |
+
import xml.etree.ElementTree as ET
|
11 |
+
import re
|
12 |
+
import fitz # PyMuPDF
|
13 |
+
|
14 |
+
def get_fitz_urls(pdf_path):
    """Collect the URI targets of the link annotations in a PDF.

    :param pdf_path: Path of the PDF file, passed to ``fitz.open``.
    :return: List of the ``uri`` values of every link that has one, in page
        order (duplicates are kept).
    """
    # Use the document as a context manager so the file is closed even when
    # reading a page fails; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return [
            link["uri"]
            for page in doc
            for link in page.get_links()
            if "uri" in link
        ]
|
24 |
+
|
25 |
+
|
26 |
+
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}
|
27 |
+
|
28 |
+
def find_pattern_in_xml(root, pattern):
    """
    Recursively search for a regex pattern in all text fields of an XML tree.

    For every element the element text, its attribute values, its children
    and the tail text following each child are searched, in that order, so
    text that sits after an inline child element is no longer missed.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for (string or compiled)
    :return: A list of matches as returned by ``findall``
    """
    # Compile once for the whole traversal instead of once per element.
    regex = re.compile(pattern)
    matches = []

    def _collect(element):
        # Text directly inside the element, before its first child.
        if element.text:
            matches.extend(regex.findall(element.text))

        for attr_value in element.attrib.values():
            matches.extend(regex.findall(attr_value))

        for child in element:
            _collect(child)
            # ElementTree stores text that follows a child's closing tag on
            # the child itself, as ``tail``.
            if child.tail:
                matches.extend(regex.findall(child.tail))

    _collect(root)
    return matches
|
52 |
+
|
53 |
+
def fetch_url(pdf_path):
    """Extract candidate code-repository URLs from a PDF.

    URLs come from two sources: a regular-expression scan of the text
    extracted with pdfplumber, and the link annotations returned by
    ``get_fitz_urls``. The combined list is de-duplicated, sorted, and
    reduced to entries containing both "/" and "git".

    :param pdf_path: Path of the PDF file.
    :return: Sorted list of unique URL strings.
    :raises ValueError: If ``pdf_path`` is None.
    """
    if pdf_path is None:
        raise ValueError("Pdf has no path")

    link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Re-join URLs broken across lines after "-", "_" or "/",
                # then flatten the remaining line breaks into spaces.
                flattened = (
                    text.replace("-\n", "-")
                    .replace("_\n", "_")
                    .replace("/\n", "/")
                    .replace("\n", " ")
                )
                page_texts.append(flattened + " ")
    full_text = "".join(page_texts)

    # URLs written in the text plus URLs stored as link annotations.
    urls = re.findall(link_pattern, full_text)
    urls.extend(get_fitz_urls(pdf_path))

    # sorted(set(...)) gives the same ordering np.unique produced, but keeps
    # plain ``str`` values instead of numpy.str_ scalars.
    return [url for url in sorted(set(urls)) if "/" in url and "git" in url]
|
86 |
+
|
evaluations/utils.py
CHANGED
@@ -4,7 +4,8 @@ import time
|
|
4 |
import os
|
5 |
import zipfile
|
6 |
import json
|
7 |
-
|
|
|
8 |
|
9 |
def fetch_code(zip_file):
|
10 |
zip_content_dict = {}
|
@@ -14,27 +15,8 @@ def fetch_code(zip_file):
|
|
14 |
zip_content_dict[file_name] = file_content
|
15 |
return zip_content_dict
|
16 |
|
17 |
-
def get_api_link(url):
|
18 |
-
username, repo_name = decompose_url(url)
|
19 |
-
if (username == None):
|
20 |
-
return ""
|
21 |
-
return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
|
22 |
|
23 |
-
def
|
24 |
-
try:
|
25 |
-
url = url.split("github.com")[1]
|
26 |
-
url = url.strip(".")
|
27 |
-
url = url.split(".git")[0]
|
28 |
-
url = url.strip("/")
|
29 |
-
parts = url.split("/")
|
30 |
-
username = parts[0]
|
31 |
-
repo_name = parts[1]
|
32 |
-
return username, repo_name
|
33 |
-
except:
|
34 |
-
return None, None
|
35 |
-
|
36 |
-
|
37 |
-
def fetch_repo_stars(verbose, repo_url, token):
|
38 |
headers = {"Authorization": f"token {token}"}
|
39 |
api_url = get_api_link(repo_url)
|
40 |
api_url = api_url.replace("/zipball/", "")
|
@@ -45,38 +27,14 @@ def fetch_repo_stars(verbose, repo_url, token):
|
|
45 |
if response.status_code == 200:
|
46 |
return json.loads(response.content)["stargazers_count"]
|
47 |
if (response.status_code == 404):
|
48 |
-
|
49 |
-
|
50 |
-
def fetch_repo(verbose, repo_url, repo_name, token):
|
51 |
-
if (os.path.exists(repo_name)):
|
52 |
-
os.remove(repo_name)
|
53 |
-
|
54 |
-
if ("github.com" not in repo_url):
|
55 |
-
log(verbose, "ERROR", f"URL not for github repo, please evaluate manually ({repo_url}).")
|
56 |
-
return
|
57 |
-
|
58 |
-
headers = {"Authorization": f"token {token}"}
|
59 |
-
api_url = get_api_link(repo_url)
|
60 |
-
|
61 |
-
if (api_url == ""):
|
62 |
-
log(verbose, "ERROR", f"Failed to parse the URL, please evaluate manually ({repo_url}).")
|
63 |
-
return
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
if response.status_code == 200:
|
69 |
-
with open(repo_name, 'wb') as file:
|
70 |
-
file.write(response.content)
|
71 |
-
if (response.status_code == 404):
|
72 |
-
log(verbose, "ERROR", "Repository private / Link broken.")
|
73 |
-
|
74 |
-
def fetch_readme(zip):
|
75 |
-
readme_files = [readme for readme in zip.namelist() if ((readme.endswith("README.MD") | readme.endswith("README.md") | readme.endswith("readme.md")) & (len(readme.split("/")) == 2))]
|
76 |
-
readme = ""
|
77 |
for readme_file in readme_files:
|
78 |
-
|
79 |
-
return
|
80 |
|
81 |
def fetch_license(zip):
|
82 |
license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
|
@@ -85,7 +43,7 @@ def fetch_license(zip):
|
|
85 |
license = zip.open(license_files[0]).read().decode("utf-8")
|
86 |
return license
|
87 |
|
88 |
-
def fetch_openalex(
|
89 |
api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
|
90 |
|
91 |
response = requests.get(api_url)
|
@@ -93,36 +51,7 @@ def fetch_openalex(verbose, paper_name, year):
|
|
93 |
if response.status_code == 200:
|
94 |
return response.json()
|
95 |
else:
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
def log(verbose, log_type, log_text, hf=False):
|
100 |
-
if (verbose == 0):
|
101 |
-
return
|
102 |
-
|
103 |
-
show_tips = (verbose == 2) | (verbose == 4)
|
104 |
-
if ((verbose == 1) | (verbose == 2)):
|
105 |
-
show = print
|
106 |
-
if ((verbose == 3) | (verbose == 4)):
|
107 |
-
show = st.write
|
108 |
-
|
109 |
-
# Align line-break
|
110 |
-
if (log_text.startswith("\n")):
|
111 |
-
show("\n")
|
112 |
-
log_text = log_text.lstrip('\n')
|
113 |
-
|
114 |
-
# Only show tips in verbose mode 2 and 4
|
115 |
-
if ((log_type == "TITLE") & show_tips):
|
116 |
-
show(f"\n#### {log_text}")
|
117 |
-
if ((log_type == "TIP") & show_tips):
|
118 |
-
show(f"*{log_text}*")
|
119 |
-
if ((log_type == "LOG") & show_tips):
|
120 |
-
show(f"{log_text}")
|
121 |
-
if ((log_type == "ERROR")):
|
122 |
-
show(f"**{log_text}**")
|
123 |
-
|
124 |
-
if ((log_type != "TIP") & (log_type != "LOG") & (log_type != "ERROR") & (log_type != "TITLE")):
|
125 |
-
raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")
|
126 |
|
127 |
-
def init_llm(
|
128 |
-
|
|
|
4 |
import os
|
5 |
import zipfile
|
6 |
import json
|
7 |
+
from core.conversion import get_api_link
|
8 |
+
from core.conversion import noop_logger
|
9 |
|
10 |
def fetch_code(zip_file):
|
11 |
zip_content_dict = {}
|
|
|
15 |
zip_content_dict[file_name] = file_content
|
16 |
return zip_content_dict
|
17 |
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
def fetch_repo_stars(repo_url, token, log_fn=noop_logger):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
headers = {"Authorization": f"token {token}"}
|
21 |
api_url = get_api_link(repo_url)
|
22 |
api_url = api_url.replace("/zipball/", "")
|
|
|
27 |
if response.status_code == 200:
|
28 |
return json.loads(response.content)["stargazers_count"]
|
29 |
if (response.status_code == 404):
|
30 |
+
log_fn("ERROR", "Repository private.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
def fetch_readmes(zip):
    """Return the text of every top-level README in a repository archive.

    A member counts as a top-level README when its name ends with
    "readme.md" (case-insensitive) and its path has exactly two
    "/"-separated parts (``<root folder>/<file>``).

    :param zip: Open archive exposing ``namelist()`` and ``open()``.
    :return: List of README contents as strings, in archive order.
    """
    readme_files = [
        name for name in zip.namelist()
        if name.lower().endswith("readme.md") and len(name.split("/")) == 2
    ]
    readmes = []
    for readme_file in readme_files:
        # Close each member handle instead of leaking it.
        with zip.open(readme_file) as handle:
            # One stray non-UTF-8 byte should not abort the whole evaluation;
            # undecodable bytes become U+FFFD.
            readmes.append(handle.read().decode("utf-8", errors="replace"))
    return readmes
|
38 |
|
39 |
def fetch_license(zip):
|
40 |
license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
|
|
|
43 |
license = zip.open(license_files[0]).read().decode("utf-8")
|
44 |
return license
|
45 |
|
46 |
+
def fetch_openalex(paper_name, year, log_fn=noop_logger):
|
47 |
api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
|
48 |
|
49 |
response = requests.get(api_url)
|
|
|
51 |
if response.status_code == 200:
|
52 |
return response.json()
|
53 |
else:
|
54 |
+
log_fn("WARNING", "Could not find OpenAlex information for paper.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
+
def init_llm(log_fn=noop_logger):
|
57 |
+
log_fn("LOG", "Initializing LLM...")
|
evaluations/validating.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
from .utils import log
|
2 |
import re
|
|
|
3 |
|
4 |
-
def evaluate(
|
5 |
-
|
6 |
overall = "No"
|
7 |
patterns = {
|
8 |
'tensorflow': [
|
@@ -23,14 +23,15 @@ def evaluate(verbose, llm, zip, readme):
|
|
23 |
for framework, regex_list in patterns.items():
|
24 |
for pattern in regex_list:
|
25 |
if re.search(pattern, code):
|
26 |
-
|
27 |
overall = "Yes"
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
if (overall == "No"):
|
35 |
-
|
36 |
return overall
|
|
|
|
|
1 |
import re
|
2 |
+
from core.conversion import noop_logger
|
3 |
|
4 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
|
5 |
+
log_fn("TITLE", "\nLooking for examples for running the model...")
|
6 |
overall = "No"
|
7 |
patterns = {
|
8 |
'tensorflow': [
|
|
|
23 |
for framework, regex_list in patterns.items():
|
24 |
for pattern in regex_list:
|
25 |
if re.search(pattern, code):
|
26 |
+
log_fn("LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
|
27 |
overall = "Yes"
|
28 |
|
29 |
+
for readme in readmes:
|
30 |
+
if (readme):
|
31 |
+
if ((len(re.findall("testing", readme)) > 0)):
|
32 |
+
log_fn("LOG", "Found information about evaluations in readme")
|
33 |
+
overall = "Yes"
|
34 |
|
35 |
if (overall == "No"):
|
36 |
+
log_fn("ERROR", "Found no code for evaluating the model.")
|
37 |
return overall
|
evaluations/weights.py
CHANGED
@@ -1,52 +1,53 @@
|
|
1 |
-
from .utils import log
|
2 |
import re
|
|
|
3 |
|
4 |
-
def evaluate(
|
5 |
-
|
6 |
overall = "No"
|
7 |
files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
|
8 |
if (len(files) > 0):
|
9 |
-
|
10 |
overall = "Yes"
|
11 |
return overall
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
52 |
return overall
|
|
|
|
|
1 |
import re
|
2 |
+
from core.conversion import noop_logger
|
3 |
|
4 |
+
def evaluate(llm, zip, readmes, log_fn=noop_logger):
    """Decide whether pre-trained model weights are available.

    Checks, in order: weight-like files inside the archive; then, per README,
    a URL containing "pth", a line combining a keyword ("pretrained",
    "pre-trained", "weight", or "download" with "model") with "http", and
    finally, when an LLM is supplied, the LLM's answer. The first hit wins.

    :param llm: Optional object with ``predict(mode, prompt)``; skipped if falsy.
    :param zip: Open repository archive exposing ``namelist()``.
    :param readmes: Iterable of README texts; falsy entries are skipped.
    :param log_fn: Callable invoked as ``log_fn(log_type, message)``.
    :return: "Yes" or "No".
    """
    log_fn("TITLE", "\nLooking for pre-trained model weights...")

    weight_suffixes = (".h5", ".pth", ".torch", ".pt", ".tar.gz", "checkpoint.pt", "ckpt")
    files = [
        name for name in zip.namelist()
        if name.endswith(weight_suffixes) or "weights" in name
    ]
    if files:
        log_fn("LOG", f"Found model weights: {files}")
        return "Yes"

    def mentions(word, line):
        # Case-insensitive regex search, mirroring the original findall checks.
        return re.search(word, line, re.IGNORECASE) is not None

    # (words that must all appear on one line together with "http", log message)
    keyword_rules = (
        (("pretrained",), "Found a link for 'pretrained' something in readme"),
        (("pre-trained",), "Found a link for 'pre-trained' something in readme"),
        (("weight",), "Found a link for 'weight' something in readme"),
        (("download", "model"), "Found a link for 'model' something in readme"),
    )

    for readme in readmes:
        if not readme:
            continue

        found_urls = re.findall(r'(https?://[^\s]+)', readme)
        if any("pth" in url for url in found_urls):
            log_fn("LOG", "Found a link to pre-trained weights in readme")
            return "Yes"

        rows = readme.split("\n")
        for words, message in keyword_rules:
            required = (*words, "http")
            if any(all(mentions(word, row) for word in required) for row in rows):
                log_fn("LOG", message)
                return "Yes"

        if llm:
            prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
            ans = llm.predict("STRICT", prompt)
            if "Yes" in ans and "No" not in ans:
                log_fn("LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
                return "Yes"

    log_fn("ERROR", "Found no pre-trained model weights.")
    return "No"
|
full_eval.py
CHANGED
@@ -1,4 +1,19 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from evaluations.repo_evaluations import process_row
|
3 |
+
from evaluations.utils import *
|
4 |
+
from concurrent.futures import ProcessPoolExecutor
|
5 |
+
from tqdm import tqdm
|
6 |
+
from core.paper import Paper
|
7 |
|
8 |
+
def main():
    """Run the repository evaluation for every paper in ``data/urls.csv``.

    Papers are evaluated in parallel worker processes and the per-paper
    dictionaries are written to ``data/results.csv`` (tab-separated).
    """
    paper_dump = pd.read_csv("data/urls.csv", sep="\t")
    papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]

    worker_count = 12
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        evaluated = executor.map(process_row, papers)
        progress = tqdm(evaluated, total=len(papers), desc="Running repo evaluations")
        papers = list(progress)

    results_df = pd.DataFrame([paper.to_dict() for paper in papers])
    results_df.to_csv("data/results.csv", sep="\t", index=False)


if __name__ == "__main__":
    main()
|
midl_summary.py
DELETED
@@ -1,57 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import pandas as pd
|
3 |
-
import numpy as np
|
4 |
-
|
5 |
-
|
6 |
-
compare_to_gt = True
|
7 |
-
ground_truth = pd.read_csv("data/zipfiles.csv", sep="\t")
|
8 |
-
results = pd.read_csv("data/results.csv", sep="\t")
|
9 |
-
verbose = 0
|
10 |
-
|
11 |
-
eval_readme = []
|
12 |
-
eval_training = []
|
13 |
-
eval_evaluating = []
|
14 |
-
eval_licensing = []
|
15 |
-
eval_weights = []
|
16 |
-
eval_dependencies = []
|
17 |
-
full_results = []
|
18 |
-
for (index1, row1), (index2, row2) in zip(ground_truth.iterrows(), results.iterrows()):
|
19 |
-
if (pd.isna(row1["training"])):
|
20 |
-
continue
|
21 |
-
|
22 |
-
print(f"\nEvaluating {index1+1} out of {len(ground_truth.index)} papers...")
|
23 |
-
print(f'Paper title - "{row1["title"]}" ({row1["year"]})')
|
24 |
-
print(f'Repository link - {row1["url"]}')
|
25 |
-
if ((not(pd.isna(row1["dependencies"]))) & (row2["pred_dependencies"] is not None)):
|
26 |
-
eval_dependencies.append(row2["pred_dependencies"] == row1["dependencies"])
|
27 |
-
if (row2["pred_dependencies"] != row1["dependencies"]):
|
28 |
-
print(f"Dependencies acc. - {row2['pred_dependencies']} (GT:{row1['dependencies']})")
|
29 |
-
if ((not(pd.isna(row1["training"]))) & (row2["pred_dependencies"] is not None)):
|
30 |
-
eval_training.append(row1["training"] == row2["pred_training"])
|
31 |
-
if (row1["training"] != row2["pred_training"]):
|
32 |
-
print(f"Training acc. -{row2['pred_training']} (GT:{row1['training']})")
|
33 |
-
if ((not(pd.isna(row1["evaluation"]))) & (row2["pred_dependencies"] is not None)):
|
34 |
-
eval_evaluating.append(row1["evaluation"] == row2["pred_evaluation"])
|
35 |
-
if (row1["evaluation"] != row2["pred_evaluation"]):
|
36 |
-
print(f"Evaluating acc. - {row2['pred_evaluation']} (GT:{row1['evaluation']})")
|
37 |
-
if ((not(pd.isna(row1["weights"]))) & (row2["pred_dependencies"] is not None)):
|
38 |
-
eval_weights.append(row1["weights"] == row2["pred_weights"])
|
39 |
-
if (row1["weights"] != row2["pred_weights"]):
|
40 |
-
print(f"Weights acc. - {row2['pred_weights']} (GT:{row1['weights']})")
|
41 |
-
if ((not(pd.isna(row1["readme"]))) & (row2["pred_dependencies"] is not None)):
|
42 |
-
eval_readme.append(row1["readme"] == row2["pred_readme"])
|
43 |
-
if (row1["readme"] != row2["pred_readme"]):
|
44 |
-
print(f"README acc. - {row2['pred_readme']} (GT:{row1['readme']})")
|
45 |
-
if ((not(pd.isna(row1["license"]))) & (row2["pred_dependencies"] is not None)):
|
46 |
-
eval_licensing.append(("No" if row1["license"] == "No" else "Yes") == row2["pred_license"])
|
47 |
-
if (("No" if row1["license"] == "No" else "Yes") != row2["pred_license"]):
|
48 |
-
print(f"LICENSE acc. - {row2['pred_license']} (GT:{row1['license']})")
|
49 |
-
|
50 |
-
|
51 |
-
print("\nSummary:")
|
52 |
-
print(f"Dependencies acc. - {int(100 * np.mean(eval_dependencies))}%")
|
53 |
-
print(f"Training acc. - {int(100 * np.mean(eval_training))}%")
|
54 |
-
print(f"Evaluating acc. - {int(100 * np.mean(eval_evaluating))}%")
|
55 |
-
print(f"Weights acc. - {int(100 * np.mean(eval_weights))}%")
|
56 |
-
print(f"README acc. - {int(100 * np.mean(eval_readme))}%")
|
57 |
-
print(f"LICENSE acc. - {int(100 * np.mean(eval_licensing))}%")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
plotting/midl_summary.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
5 |
+
sys.path.append(ROOT_DIR)
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
from core.paper import Paper
|
9 |
+
|
10 |
+
def compare(ground_truth, automated_truth, key, verbose=False):
    """Check whether the automated result agrees with the manual label.

    For ``key == "license"`` the manual label is reduced to "No"/"Yes"
    before comparing (anything other than "No" counts as "Yes"). The
    reduction uses a local copy, so the caller's ``ground_truth`` is left
    untouched.

    :param ground_truth: Mapping of criterion name to manual label.
    :param automated_truth: Mapping of criterion name to automated label.
    :param key: Criterion to compare.
    :param verbose: When true, print a line for every disagreement.
    :return: True/False for agreement, or ``np.nan`` when the key is missing
        from either mapping or either value is NaN.
    """
    if key not in ground_truth or key not in automated_truth:
        return np.nan

    expected = ground_truth[key]
    predicted = automated_truth[key]
    if pd.isna(expected) or pd.isna(predicted):
        return np.nan

    if key == "license":
        # Manual labels name the concrete licence; only its presence is scored.
        expected = "No" if expected == "No" else "Yes"

    res = expected == predicted
    if verbose and not res:
        print(f"{key} acc. - {predicted} (GT:{expected})")
    return res
|
22 |
+
|
23 |
+
max_workers = 6
compare_to_gt = True
verbose = True
# True: score papers published before 2024; False: score 2024 and later.
training = True

paper_dump = pd.read_csv("data/results.csv", sep="\t")
papers = [Paper.from_row(row) for _, row in paper_dump.iterrows()]

# Summary label -> key in the per-paper reproducibility dictionaries.
criteria = {
    "Dependencies": "dependencies",
    "Training": "training",
    "Evaluating": "evaluation",
    "Weights": "weights",
    "README": "readme",
    "LICENSE": "license",
}
outcomes = {label: [] for label in criteria}


def _format_accuracy(values):
    """Return the share of agreeing comparisons as a whole percentage.

    NaN entries (comparisons that could not be made) are ignored; "n/a" is
    returned when nothing was scored, where ``int(np.nanmean(...))`` would
    raise on the resulting NaN.
    """
    scored = [bool(value) for value in values if not pd.isna(value)]
    if not scored:
        return "n/a"
    return f"{int(100 * np.mean(scored))}%"


for idx, paper in enumerate(papers):
    if paper.venue != "MIDL" or paper.main_repo_url is None:
        continue
    # Keep only the requested split: papers from 2024 onwards are excluded
    # when ``training`` is set, and are the only ones kept otherwise.
    from_2024_on = int(paper.year) >= 2024
    if from_2024_on == training:
        continue

    if verbose:
        # idx is 0-based; report a 1-based position.
        print(f"\nEvaluating {idx + 1} out of {len(papers)} papers...")
        print(f'Paper title - "{paper.title}" ({paper.year})')
        print(f'Repository link - {paper.main_repo_url}')

    for label, key in criteria.items():
        outcomes[label].append(
            compare(paper.code_repro_manual, paper.code_repro_auto, key, verbose)
        )

print("\nSummary:")
for label in criteria:
    print(f"{label} acc. - {_format_accuracy(outcomes[label])}")
|
plotting/paper_plots.py
CHANGED
@@ -1,41 +1,34 @@
|
|
1 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
-
|
4 |
-
paper_dump = pd.read_csv('data/dump.csv', sep="\t")
|
5 |
-
# Calculate total number of URLs per year and venue
|
6 |
-
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
7 |
-
total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
|
8 |
-
|
9 |
-
# Calculate the number of URLs with errors per year and venue
|
10 |
-
total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
|
11 |
-
|
12 |
-
# Merge the DataFrames to calculate the error rate
|
13 |
-
merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
|
14 |
-
merged_df['repo_rate'] = merged_df['total_urls'] / merged_df['total_titles']
|
15 |
-
|
16 |
-
# Plot the error rates using Plotly, with year on x-axis and color by venue
|
17 |
-
fig = px.bar(
|
18 |
-
merged_df,
|
19 |
-
x='year',
|
20 |
-
y='total_titles',
|
21 |
-
color='venue',
|
22 |
-
barmode='group',
|
23 |
-
title=f'Number of papers per venue',
|
24 |
-
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
25 |
-
category_orders={'venue': custom_order}
|
26 |
-
)
|
27 |
-
|
28 |
-
fig.update_xaxes(range=[2018, 2024])
|
29 |
-
fig.show()
|
30 |
-
|
31 |
import plotly.express as px
|
32 |
import numpy as np
|
|
|
|
|
|
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
# Calculate total number of URLs per year and venue
|
35 |
total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
|
36 |
|
37 |
# Calculate the number of URLs with errors per year and venue
|
38 |
-
total_url_per_venue = paper_dump[
|
|
|
|
|
39 |
|
40 |
# Merge the DataFrames to calculate the error rate
|
41 |
merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
|
@@ -50,25 +43,10 @@ fig = px.bar(
|
|
50 |
barmode='group',
|
51 |
title=f'Number of papers per venue',
|
52 |
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
53 |
-
category_orders={'venue':
|
54 |
-
)
|
55 |
-
|
56 |
-
fig.update_xaxes(range=[2018, 2024])
|
57 |
-
fig.show()
|
58 |
-
|
59 |
-
# Plot the error rates using Plotly, with year on x-axis and color by venue
|
60 |
-
fig = px.bar(
|
61 |
-
merged_df,
|
62 |
-
x='year',
|
63 |
-
y='total_urls',
|
64 |
-
color='venue',
|
65 |
-
barmode='group',
|
66 |
-
title=f'Number of papers per venue',
|
67 |
-
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
68 |
-
category_orders={'venue': custom_order}
|
69 |
)
|
70 |
|
71 |
-
fig.update_xaxes(range=[2018,
|
72 |
fig.show()
|
73 |
|
74 |
|
@@ -81,9 +59,9 @@ fig = px.bar(
|
|
81 |
barmode='group',
|
82 |
title=f'Number of repositories per venue',
|
83 |
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
84 |
-
category_orders={'venue':
|
85 |
)
|
86 |
-
fig.update_xaxes(range=[2018,
|
87 |
fig.update_yaxes(range=[0, 1])
|
88 |
|
89 |
fig.show()
|
|
|
1 |
import plotly.express as px
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
5 |
+
sys.path.append(ROOT_DIR)
|
6 |
import numpy as np
|
7 |
+
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import plotly.express as px
|
9 |
import numpy as np
|
10 |
+
from config.constants import VENUE_ORDER
|
11 |
+
import json
|
12 |
+
from core.paper import _parse_url_field
|
13 |
|
14 |
+
paper_dump = pd.read_csv('data/urls.csv', sep="\t")
|
15 |
+
paper_dump['urls_manual'] = paper_dump['urls_manual'].apply(
|
16 |
+
lambda x: _parse_url_field(json.loads(x))
|
17 |
+
)
|
18 |
+
paper_dump['urls_auto'] = paper_dump['urls_auto'].apply(
|
19 |
+
lambda x: _parse_url_field(json.loads(x))
|
20 |
+
)
|
21 |
+
paper_dump['url'] = paper_dump.apply(
|
22 |
+
lambda row: next((u for u in [*row['urls_manual'], *row['urls_auto']] if "github.com" in u), None),
|
23 |
+
axis=1
|
24 |
+
)
|
25 |
# Calculate total number of URLs per year and venue
|
26 |
total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
|
27 |
|
28 |
# Calculate the number of URLs with errors per year and venue
|
29 |
+
total_url_per_venue = paper_dump[
|
30 |
+
paper_dump["url"].notna() & (paper_dump["url"] != "")
|
31 |
+
].groupby(['year', 'venue']).size().reset_index(name='total_urls')
|
32 |
|
33 |
# Merge the DataFrames to calculate the error rate
|
34 |
merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
|
|
|
43 |
barmode='group',
|
44 |
title=f'Number of papers per venue',
|
45 |
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
46 |
+
category_orders={'venue': VENUE_ORDER}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
)
|
48 |
|
49 |
+
fig.update_xaxes(range=[2018, 2025])
|
50 |
fig.show()
|
51 |
|
52 |
|
|
|
59 |
barmode='group',
|
60 |
title=f'Number of repositories per venue',
|
61 |
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
62 |
+
category_orders={'venue': VENUE_ORDER}
|
63 |
)
|
64 |
+
fig.update_xaxes(range=[2018, 2025])
|
65 |
fig.update_yaxes(range=[0, 1])
|
66 |
|
67 |
fig.show()
|
plotting/print_incorrect.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import plotly.express as px
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Define columns for all relevant predictions
|
6 |
+
pred_columns = [
    'pred_dependencies',
    'pred_training',
    'pred_evaluation',
    'pred_weights',
    'pred_readme',
    'pred_license',
]

# Manual-label column -> automated-prediction column.
real_pred_columns = {column[len('pred_'):]: column for column in pred_columns}

df = pd.read_csv('data/results.csv', sep="\t")

# Drop rows whose year is not numeric and store the year as an integer.
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df = df.dropna(subset=['year'])
df['year'] = df['year'].astype(int)

custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]


def _license_presence(label):
    """Reduce a manual licence label to "Yes", keeping "No" and NaN as they are."""
    if label == "No" or pd.isna(label):
        return label
    return "Yes"


# Only rows whose pred_live value is "Yes" are compared.
df_filtered = df[df['pred_live'] == "Yes"].copy()
df_filtered['license'] = df_filtered['license'].apply(_license_presence)

for real, pred in real_pred_columns.items():
    df_filtered[f'matching_{real}'] = df_filtered[real] == df_filtered[pred]

has_url = df_filtered["url"].notna() & (df_filtered["url"] != "")
not_2024 = df_filtered['year'] != 2024

for real, pred in real_pred_columns.items():
    print(f"Evaluations for {real}:")
    # A row is reported when it is comparable (not 2024, has a URL and a
    # manual label) and the automated prediction disagrees with the label.
    mismatched = (
        not_2024
        & has_url
        & df_filtered[real].notna()
        & ~df_filtered[f'matching_{real}']
    )
    for _, row in df_filtered[mismatched].iterrows():
        print(f"Automated test for {real} failed for link: {row['url']} [{row[real]} - {row[pred]}]")
|
plotting/result_plots.py
CHANGED
@@ -1,25 +1,66 @@
|
|
1 |
import plotly.express as px
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
df = pd.read_csv('data/results.csv', sep="\t")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
6 |
|
7 |
-
#
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
errors_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='errors')
|
12 |
|
13 |
-
# Merge the DataFrames to calculate the error rate
|
14 |
-
error_rate_df = pd.merge(total_urls_per_year_venue, errors_per_year_venue, on=['year', 'venue'], how='left')
|
15 |
-
error_rate_df['errors'] = error_rate_df['errors'].fillna(0) # Replace NaN with 0 for venues with no errors
|
16 |
-
error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
|
17 |
|
18 |
# Plot the error rates using Plotly, with year on x-axis and color by venue
|
19 |
fig = px.bar(
|
20 |
-
|
21 |
x='year',
|
22 |
-
y='
|
23 |
color='venue',
|
24 |
barmode='group',
|
25 |
title=f'Success Rate per Venue and Year for "valid_url"',
|
@@ -32,48 +73,53 @@ fig.update_xaxes(range=[2017.5, 2024.5])
|
|
32 |
fig.show()
|
33 |
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
# Calculate the number of URLs with errors per year and venue
|
40 |
-
passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
color='venue',
|
53 |
-
barmode='group',
|
54 |
-
title=f'Success Rate per Venue and Year for "{topic}"',
|
55 |
-
labels={'error_rate': 'Success Rate', 'year': 'Year'},
|
56 |
-
category_orders={'venue': custom_order}
|
57 |
-
)
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
62 |
|
|
|
|
|
|
|
63 |
|
64 |
# List of columns to check for "No"
|
65 |
-
|
|
|
|
|
|
|
66 |
|
67 |
# Step 1: Calculate the number of "No" answers per row for the specified columns
|
68 |
-
|
69 |
|
70 |
# Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
|
71 |
fig = px.scatter(
|
72 |
-
|
73 |
x='pred_citations',
|
74 |
y='no_count',
|
75 |
color='venue',
|
76 |
-
title='Number of
|
77 |
labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
|
78 |
category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
|
79 |
log_x=True
|
@@ -82,19 +128,16 @@ fig = px.scatter(
|
|
82 |
# Step 3: Display the scatter plot
|
83 |
fig.show()
|
84 |
|
85 |
-
#
|
86 |
-
|
87 |
-
|
88 |
-
# Step 1: Calculate the number of "No" answers per row for the specified columns
|
89 |
-
df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
|
90 |
|
91 |
# Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
|
92 |
fig = px.strip(
|
93 |
-
|
94 |
x='venue',
|
95 |
y='no_count',
|
96 |
color='venue',
|
97 |
-
title='
|
98 |
labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
|
99 |
category_orders={'venue': custom_order}, # Ensure custom order for venues
|
100 |
stripmode='overlay' # Allows all individual points to overlay each other
|
@@ -105,7 +148,7 @@ fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))
|
|
105 |
|
106 |
# Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
|
107 |
fig.add_trace(px.box(
|
108 |
-
|
109 |
x='venue',
|
110 |
y='no_count',
|
111 |
category_orders={'venue': custom_order}
|
@@ -114,28 +157,38 @@ fig.add_trace(px.box(
|
|
114 |
# Step 5: Show the plot
|
115 |
fig.show()
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import plotly.express as px
|
2 |
import pandas as pd
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Define columns for all relevant predictions
|
6 |
+
pred_columns = ['pred_dependencies', 'pred_training',
|
7 |
+
'pred_evaluation', 'pred_weights', 'pred_readme',
|
8 |
+
'pred_license']
|
9 |
+
|
10 |
+
# Define the real and predicted column pairs
|
11 |
+
real_pred_columns = {
|
12 |
+
'dependencies': 'pred_dependencies',
|
13 |
+
'training': 'pred_training',
|
14 |
+
'evaluation': 'pred_evaluation',
|
15 |
+
'weights': 'pred_weights',
|
16 |
+
'readme': 'pred_readme',
|
17 |
+
'license': 'pred_license'
|
18 |
+
}
|
19 |
|
20 |
df = pd.read_csv('data/results.csv', sep="\t")
|
21 |
+
|
22 |
+
# Cleanup
|
23 |
+
df['year'] = pd.to_numeric(df['year'], errors='coerce')
|
24 |
+
df = df.dropna(subset=['year'])
|
25 |
+
df['year'] = df['year'].astype(int)
|
26 |
+
|
27 |
+
# df['venue'] = df['venue'].apply(lambda x: str(re.search(r"'(.*?)'", x).group(1)))
|
28 |
+
|
29 |
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
|
30 |
|
31 |
+
# Group by year and venue, and calculate the ratio of papers where URL is not None
|
32 |
+
df_grouped = df.groupby(['year', 'venue']).agg(
|
33 |
+
total_papers=('title', 'count'),
|
34 |
+
papers_with_url=('url', lambda x: x.notna().sum()),
|
35 |
+
valid_urls=('pred_live', lambda x: (x == "Yes").sum())
|
36 |
+
).reset_index()
|
37 |
+
|
38 |
+
df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']
|
39 |
+
|
40 |
+
# Create the plotly figure
|
41 |
+
fig = px.bar(
|
42 |
+
df_grouped,
|
43 |
+
x='year',
|
44 |
+
y='ratio',
|
45 |
+
color='venue',
|
46 |
+
barmode='group',
|
47 |
+
title=f'Success Rate per Venue and Year for "valid_url"',
|
48 |
+
labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},
|
49 |
+
category_orders={'venue': custom_order}
|
50 |
+
)
|
51 |
+
|
52 |
+
fig.update_yaxes(range=[0, 1])
|
53 |
+
fig.update_xaxes(range=[2017.5, 2024.5])
|
54 |
+
fig.show()
|
55 |
|
56 |
+
df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']
|
|
|
57 |
|
|
|
|
|
|
|
|
|
58 |
|
59 |
# Plot the ratio of live URLs among papers that have a URL, with year on x-axis and color by venue
|
60 |
fig = px.bar(
|
61 |
+
df_grouped,
|
62 |
x='year',
|
63 |
+
y='valid_ratio',
|
64 |
color='venue',
|
65 |
barmode='group',
|
66 |
title=f'Success Rate per Venue and Year for "valid_url"',
|
|
|
73 |
fig.show()
|
74 |
|
75 |
|
76 |
+
# Ensure boolean columns are actually booleans
|
77 |
+
df_new = df.copy()
|
78 |
+
for col in pred_columns:
|
79 |
+
df_new[col] = df_new[col] == "Yes"
|
80 |
+
|
81 |
+
df_grouped = df_new.groupby('venue').agg(
|
82 |
+
valid_urls=('pred_live', lambda x: (x == "Yes").sum()),
|
83 |
+
**{col: (col, lambda x: x[df_new['pred_live'] == "Yes"].sum()) for col in pred_columns}
|
84 |
+
).reset_index()
|
85 |
|
|
|
|
|
86 |
|
87 |
+
# Calculate the ratio for each prediction column
|
88 |
+
for col in pred_columns:
|
89 |
+
df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']
|
|
|
90 |
|
91 |
+
# Melt the dataframe for easier plotting
|
92 |
+
df_melted = df_grouped.melt(id_vars=['venue'],
|
93 |
+
value_vars=pred_columns,
|
94 |
+
var_name='Prediction Type',
|
95 |
+
value_name='Ratio')
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
+
# Create a grouped bar plot
|
98 |
+
fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',
|
99 |
+
barmode='group', # Ensures bars are side by side
|
100 |
+
category_orders={'venue': custom_order},
|
101 |
+
title='Ratio of Predictions by Venue')
|
102 |
|
103 |
+
# Show the figure
|
104 |
+
fig.update_yaxes(range=[0, 1])
|
105 |
+
fig.show()
|
106 |
|
107 |
# List of columns to check for "No"
|
108 |
+
# Step 1: Filter only rows where pred_live is "Yes"
|
109 |
+
df_filtered = df[df['pred_live'] == "Yes"].copy()
|
110 |
+
for col in pred_columns:
|
111 |
+
df_filtered[col] = df_filtered[col] == "Yes"
|
112 |
|
113 |
# Step 1: Count the "Yes" answers (passed checks) per row for the specified columns; stored in 'no_count'
|
114 |
+
df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)
|
115 |
|
116 |
# Step 2: Create scatter plot with pred_citations on x-axis and no_count on y-axis, color-coded by venue
|
117 |
fig = px.scatter(
|
118 |
+
df_filtered,
|
119 |
x='pred_citations',
|
120 |
y='no_count',
|
121 |
color='venue',
|
122 |
+
title='Number of passed tests, Color Coded by Venue',
|
123 |
labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
|
124 |
category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary
|
125 |
log_x=True
|
|
|
128 |
# Step 3: Display the scatter plot
|
129 |
fig.show()
|
130 |
|
131 |
+
# [np.corrcoef(np.array(df_filtered[col][~(pd.isna(df_filtered['pred_citations']))], dtype=int), df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])[0, 1] for col in pred_columns]
|
132 |
+
# np.corrcoef(np.array(df_filtered['no_count'][~(pd.isna(df_filtered['pred_citations']))]), (1 + np.array(df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])))
|
|
|
|
|
|
|
133 |
|
134 |
# Step 2: Create a strip plot (scatter-like) with jitter to show each paper's passed-check count
|
135 |
fig = px.strip(
|
136 |
+
df_filtered,
|
137 |
x='venue',
|
138 |
y='no_count',
|
139 |
color='venue',
|
140 |
+
title='Automated Reproducibility Score per Venue',
|
141 |
labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
|
142 |
category_orders={'venue': custom_order}, # Ensure custom order for venues
|
143 |
stripmode='overlay' # Allows all individual points to overlay each other
|
|
|
148 |
|
149 |
# Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
|
150 |
fig.add_trace(px.box(
|
151 |
+
df_filtered,
|
152 |
x='venue',
|
153 |
y='no_count',
|
154 |
category_orders={'venue': custom_order}
|
|
|
157 |
# Step 5: Show the plot
|
158 |
fig.show()
|
159 |
|
160 |
+
# Group by venue
|
161 |
+
df_filtered = df[df['pred_live'] == "Yes"].copy()
|
162 |
+
df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == "No") | (pd.isna(row))) else "Yes")
|
163 |
+
df_grouped = df_filtered.groupby('venue').agg(
|
164 |
+
total_papers=('title', 'count')
|
165 |
+
).reset_index()
|
166 |
+
|
167 |
+
# Add matching counts for each category
|
168 |
+
for real, pred in real_pred_columns.items():
|
169 |
+
df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)
|
170 |
+
|
171 |
+
# Compute the ratio for each category
|
172 |
+
for real in real_pred_columns.keys():
|
173 |
+
df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']
|
174 |
+
|
175 |
+
# Melt the dataframe for visualization
|
176 |
+
df_melted = df_grouped.melt(id_vars=['venue'],
|
177 |
+
value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()],
|
178 |
+
var_name='Category',
|
179 |
+
value_name='Ratio')
|
180 |
+
|
181 |
+
# Clean up category names
|
182 |
+
df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()
|
183 |
+
|
184 |
+
# Create the bar plot
|
185 |
+
fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',
|
186 |
+
barmode='group',
|
187 |
+
title='Ratio of Matching Real vs Predicted Categories by Venue',
|
188 |
+
labels={'Ratio': 'Ratio of Matches'})
|
189 |
+
|
190 |
+
# Ensure y-axis range is between 0 and 1
|
191 |
+
fig.update_yaxes(range=[0, 1])
|
192 |
+
|
193 |
+
# Show the figure
|
194 |
+
fig.show()
|
plotting/results.ipynb
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {
|
7 |
+
"vscode": {
|
8 |
+
"languageId": "plaintext"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"outputs": [],
|
12 |
+
"source": [
|
13 |
+
"import plotly.express as px\n",
|
14 |
+
"import pandas as pd\n",
|
15 |
+
"import re\n",
|
16 |
+
"\n",
|
17 |
+
"# Define columns for all relevant predictions\n",
|
18 |
+
"pred_columns = ['pred_dependencies', 'pred_training', \n",
|
19 |
+
" 'pred_evaluation', 'pred_weights', 'pred_readme', \n",
|
20 |
+
" 'pred_license']\n",
|
21 |
+
"\n",
|
22 |
+
"# Define the real and predicted column pairs\n",
|
23 |
+
"real_pred_columns = {\n",
|
24 |
+
" 'dependencies': 'pred_dependencies',\n",
|
25 |
+
" 'training': 'pred_training',\n",
|
26 |
+
" 'evaluation': 'pred_evaluation',\n",
|
27 |
+
" 'weights': 'pred_weights',\n",
|
28 |
+
" 'readme': 'pred_readme',\n",
|
29 |
+
" 'license': 'pred_license'\n",
|
30 |
+
"}\n"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": null,
|
36 |
+
"metadata": {
|
37 |
+
"vscode": {
|
38 |
+
"languageId": "plaintext"
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"outputs": [],
|
42 |
+
"source": [
|
43 |
+
"\n",
|
44 |
+
"df = pd.read_csv('data/results.csv', sep=\"\\t\")\n",
|
45 |
+
"\n",
|
46 |
+
"# Cleanup\n",
|
47 |
+
"df['year'] = pd.to_numeric(df['year'], errors='coerce')\n",
|
48 |
+
"df = df.dropna(subset=['year'])\n",
|
49 |
+
"df['year'] = df['year'].astype(int)\n",
|
50 |
+
"\n",
|
51 |
+
"df['venue'] = df['venue'].apply(lambda x: str(re.search(r\"'(.*?)'\", x).group(1)))\n",
|
52 |
+
"\n",
|
53 |
+
"custom_order = [\"MICCAI\", \"MIDL\", \"Nature\", \"arXiv\"]\n",
|
54 |
+
"\n",
|
55 |
+
"# Group by year and venue, and calculate the ratio of papers where URL is not None\n",
|
56 |
+
"df_grouped = df.groupby(['year', 'venue']).agg(\n",
|
57 |
+
" total_papers=('title', 'count'),\n",
|
58 |
+
" papers_with_url=('url', lambda x: x.notna().sum()),\n",
|
59 |
+
" valid_urls=('pred_live', lambda x: (x == \"Yes\").sum())\n",
|
60 |
+
").reset_index()\n",
|
61 |
+
"\n",
|
62 |
+
"df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']\n",
|
63 |
+
"\n",
|
64 |
+
"# Create the plotly figure\n",
|
65 |
+
"fig = px.bar(\n",
|
66 |
+
" df_grouped,\n",
|
67 |
+
" x='year',\n",
|
68 |
+
" y='ratio',\n",
|
69 |
+
" color='venue',\n",
|
70 |
+
" barmode='group',\n",
|
71 |
+
" title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
|
72 |
+
" labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},\n",
|
73 |
+
" category_orders={'venue': custom_order}\n",
|
74 |
+
")\n",
|
75 |
+
"\n",
|
76 |
+
"fig.update_yaxes(range=[0, 1])\n",
|
77 |
+
"fig.update_xaxes(range=[2017.5, 2024.5])\n",
|
78 |
+
"fig.show()\n",
|
79 |
+
"\n",
|
80 |
+
"df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']\n",
|
81 |
+
"\n",
|
82 |
+
"\n",
|
83 |
+
"# Plot the error rates using Plotly, with year on x-axis and color by venue\n",
|
84 |
+
"fig = px.bar(\n",
|
85 |
+
" df_grouped,\n",
|
86 |
+
" x='year',\n",
|
87 |
+
" y='valid_ratio',\n",
|
88 |
+
" color='venue',\n",
|
89 |
+
" barmode='group',\n",
|
90 |
+
" title=f'Success Rate per Venue and Year for \"valid_url\"',\n",
|
91 |
+
" labels={'error_rate': 'Success Rate', 'year': 'Year'},\n",
|
92 |
+
" category_orders={'venue': custom_order}\n",
|
93 |
+
")\n",
|
94 |
+
"\n",
|
95 |
+
"fig.update_yaxes(range=[0, 1])\n",
|
96 |
+
"fig.update_xaxes(range=[2017.5, 2024.5])\n",
|
97 |
+
"fig.show()\n",
|
98 |
+
"\n",
|
99 |
+
"\n",
|
100 |
+
"# Ensure boolean columns are actually booleans\n",
|
101 |
+
"df_new = df.copy()\n",
|
102 |
+
"for col in pred_columns:\n",
|
103 |
+
" df_new[col] = df_new[col] == \"Yes\"\n",
|
104 |
+
"\n",
|
105 |
+
"df_grouped = df_new.groupby('venue').agg(\n",
|
106 |
+
" valid_urls=('pred_live', lambda x: (x == \"Yes\").sum()),\n",
|
107 |
+
" **{col: (col, lambda x: x[df_new['pred_live'] == \"Yes\"].sum()) for col in pred_columns} \n",
|
108 |
+
").reset_index()\n",
|
109 |
+
"\n",
|
110 |
+
"\n",
|
111 |
+
"# Calculate the ratio for each prediction column\n",
|
112 |
+
"for col in pred_columns:\n",
|
113 |
+
" df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']\n",
|
114 |
+
"\n",
|
115 |
+
"# Melt the dataframe for easier plotting\n",
|
116 |
+
"df_melted = df_grouped.melt(id_vars=['venue'], \n",
|
117 |
+
" value_vars=pred_columns, \n",
|
118 |
+
" var_name='Prediction Type', \n",
|
119 |
+
" value_name='Ratio')\n",
|
120 |
+
"\n",
|
121 |
+
"# Create a grouped bar plot\n",
|
122 |
+
"fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',\n",
|
123 |
+
" barmode='group', # Ensures bars are side by side\n",
|
124 |
+
" category_orders={'venue': custom_order},\n",
|
125 |
+
" title='Ratio of Predictions by Venue')\n",
|
126 |
+
"\n",
|
127 |
+
"# Show the figure\n",
|
128 |
+
"fig.update_yaxes(range=[0, 1])\n",
|
129 |
+
"fig.show()\n",
|
130 |
+
"\n",
|
131 |
+
"# List of columns to check for \"No\"\n",
|
132 |
+
"# Step 1: Filter only rows where pred_live is \"Yes\"\n",
|
133 |
+
"df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
|
134 |
+
"for col in pred_columns:\n",
|
135 |
+
" df_filtered[col] = df_filtered[col] == \"Yes\"\n",
|
136 |
+
"\n",
|
137 |
+
"# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
|
138 |
+
"df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)\n",
|
139 |
+
"\n",
|
140 |
+
"# Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue\n",
|
141 |
+
"fig = px.scatter(\n",
|
142 |
+
" df_filtered,\n",
|
143 |
+
" x='pred_citations',\n",
|
144 |
+
" y='no_count',\n",
|
145 |
+
" color='venue',\n",
|
146 |
+
" title='Number of passed tests, Color Coded by Venue',\n",
|
147 |
+
" labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},\n",
|
148 |
+
" category_orders={'venue': custom_order}, # Ensure custom order for venue if necessary\n",
|
149 |
+
" log_x=True\n",
|
150 |
+
")\n",
|
151 |
+
"\n",
|
152 |
+
"# Step 3: Display the scatter plot\n",
|
153 |
+
"fig.show()\n",
|
154 |
+
"\n",
|
155 |
+
"# Step 1: Calculate the number of \"No\" answers per row for the specified columns\n",
|
156 |
+
"df_filtered['no_count'] = df_filtered[pred_columns].apply(lambda row: (row).sum(), axis=1)\n",
|
157 |
+
"\n",
|
158 |
+
"# Step 2: Create a strip plot (scatter-like) with jitter to show individual \"No\" counts\n",
|
159 |
+
"fig = px.strip(\n",
|
160 |
+
" df_filtered,\n",
|
161 |
+
" x='venue',\n",
|
162 |
+
" y='no_count',\n",
|
163 |
+
" color='venue',\n",
|
164 |
+
" title='Automated Reproducibility Score per Venue',\n",
|
165 |
+
" labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},\n",
|
166 |
+
" category_orders={'venue': custom_order}, # Ensure custom order for venues\n",
|
167 |
+
" stripmode='overlay' # Allows all individual points to overlay each other\n",
|
168 |
+
")\n",
|
169 |
+
"\n",
|
170 |
+
"# Step 3: Add some jitter to the x-axis so points don't overlap\n",
|
171 |
+
"fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))\n",
|
172 |
+
"\n",
|
173 |
+
"# Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread\n",
|
174 |
+
"fig.add_trace(px.box(\n",
|
175 |
+
" df_filtered,\n",
|
176 |
+
" x='venue',\n",
|
177 |
+
" y='no_count',\n",
|
178 |
+
" category_orders={'venue': custom_order}\n",
|
179 |
+
").data[0]) # We add the first trace of the box plot to overlay\n",
|
180 |
+
"\n",
|
181 |
+
"# Step 5: Show the plot\n",
|
182 |
+
"fig.show()\n"
|
183 |
+
]
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"cell_type": "code",
|
187 |
+
"execution_count": null,
|
188 |
+
"metadata": {
|
189 |
+
"vscode": {
|
190 |
+
"languageId": "plaintext"
|
191 |
+
}
|
192 |
+
},
|
193 |
+
"outputs": [],
|
194 |
+
"source": [
|
195 |
+
"\n",
|
196 |
+
"# Group by venue\n",
|
197 |
+
"df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
|
198 |
+
"df_filtered['license'] = df_filtered['license'].apply(lambda row: row if ((row == \"No\") | (pd.isna(row))) else \"Yes\")\n",
|
199 |
+
"df_grouped = df_filtered.groupby('venue').agg(\n",
|
200 |
+
" total_papers=('title', 'count')\n",
|
201 |
+
").reset_index()\n",
|
202 |
+
"\n",
|
203 |
+
"# Add matching counts for each category\n",
|
204 |
+
"for real, pred in real_pred_columns.items():\n",
|
205 |
+
" df_grouped[f'matching_{real}'] = df_filtered.groupby('venue').apply(lambda g: (g[real] == g[pred]).sum()).reset_index(drop=True)\n",
|
206 |
+
"\n",
|
207 |
+
"# Compute the ratio for each category\n",
|
208 |
+
"for real in real_pred_columns.keys():\n",
|
209 |
+
" df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']\n",
|
210 |
+
"\n",
|
211 |
+
"# Melt the dataframe for visualization\n",
|
212 |
+
"df_melted = df_grouped.melt(id_vars=['venue'], \n",
|
213 |
+
" value_vars=[f'ratio_{real}' for real in real_pred_columns.keys()], \n",
|
214 |
+
" var_name='Category', \n",
|
215 |
+
" value_name='Ratio')\n",
|
216 |
+
"\n",
|
217 |
+
"# Clean up category names\n",
|
218 |
+
"df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()\n",
|
219 |
+
"\n",
|
220 |
+
"# Create the bar plot\n",
|
221 |
+
"fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',\n",
|
222 |
+
" barmode='group', \n",
|
223 |
+
" title='Ratio of Matching Real vs Predicted Categories by Venue',\n",
|
224 |
+
" labels={'Ratio': 'Ratio of Matches'})\n",
|
225 |
+
"\n",
|
226 |
+
"# Ensure y-axis range is between 0 and 1\n",
|
227 |
+
"fig.update_yaxes(range=[0, 1])\n",
|
228 |
+
"\n",
|
229 |
+
"# Show the figure\n",
|
230 |
+
"fig.show()"
|
231 |
+
]
|
232 |
+
}
|
233 |
+
],
|
234 |
+
"metadata": {
|
235 |
+
"language_info": {
|
236 |
+
"name": "python"
|
237 |
+
}
|
238 |
+
},
|
239 |
+
"nbformat": 4,
|
240 |
+
"nbformat_minor": 2
|
241 |
+
}
|
plotting/urls.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import ast
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
7 |
+
sys.path.append(ROOT_DIR)
|
8 |
+
from core.paper import Paper
|
9 |
+
|
10 |
+
df = pd.read_csv('data/urls.csv', sep="\t")
|
11 |
+
success = 0
|
12 |
+
total = 0
|
13 |
+
papers = [Paper.from_row(row) for _, row in df.iterrows()]
|
14 |
+
|
15 |
+
def normalize_url(url):
    """Return *url* in a canonical form for set comparison.

    Surrounding whitespace is removed, the text is lower-cased, and any
    trailing "/" characters are dropped, so that two spellings of the same
    link compare equal.
    """
    trimmed = url.strip()
    lowered = trimmed.lower()
    return lowered.rstrip("/")
|
17 |
+
|
18 |
+
tp, fp, fn = 0, 0, 0
|
19 |
+
for paper in papers:
|
20 |
+
if (paper.venue == "MICCAI"):
|
21 |
+
continue
|
22 |
+
|
23 |
+
urls_auto = [normalize_url(u) for u in paper.urls_auto]
|
24 |
+
urls_manual = [normalize_url(u) for u in paper.urls_manual]
|
25 |
+
|
26 |
+
auto_set = set(urls_auto)
|
27 |
+
manual_set = set(urls_manual)
|
28 |
+
|
29 |
+
tp += len(auto_set & manual_set)
|
30 |
+
fp += len(auto_set - manual_set)
|
31 |
+
fn += len(manual_set - auto_set)
|
32 |
+
|
33 |
+
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
34 |
+
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
35 |
+
|
36 |
+
print(f"Precision: {precision:.3f}")
|
37 |
+
print(f"Recall: {recall:.3f}")
|