"""Build the paper dataset: pull the per-venue sheets from the published Google
spreadsheet, download each paper's PDF, extract URLs, clone the linked GitHub
repos, convert PDFs to Grobid TEI XML, and write the combined metadata to
data/papers.csv."""

import os
import sys

ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(ROOT_DIR)

from concurrent.futures import ProcessPoolExecutor

import pandas as pd
from tqdm import tqdm

from config.constants import VENUE_ORDER
from core.conversion import download_pdf, download_repo, pdf_to_grobid
from core.paper import Paper
from evaluations.url import fetch_url
from evaluations.utils import *


def get_urls_and_assign(paper):
    """Extract URLs from the paper's PDF and store them on the Paper object."""
    try:
        paper.urls_auto = fetch_url(paper.pdf_path)
    except Exception as e:
        paper.log("ERROR", str(e))
    return paper


def download_xml(paper):
    """Convert the paper's PDF to Grobid XML, skipping papers without a PDF or
    with an already-existing XML file."""
    try:
        if paper.pdf_path is None:
            return paper
        if os.path.exists(paper.xml_path):
            paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
            return paper
        pdf_to_grobid(paper.pdf_path, paper.xml_path)
        return paper
    except Exception as e:
        paper.log("ERROR", f"Grobid conversion failed: {e}")
        return paper


# Number of parallel worker processes used for each pipeline stage.
max_workers = 2

if __name__ == "__main__":
    # Pull each venue's sheet from the published Google spreadsheet and cache it
    # locally as a tab-separated file.
    os.makedirs("data/online_sheet", exist_ok=True)
    for venue in VENUE_ORDER:
        df = pd.read_excel(
            "https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx",
            sheet_name=venue,
        )
        # Strip tabs (the TSV delimiter) and placeholder "[]" cells before writing.
        df = df.replace('\t', ' ', regex=True)
        df = df.replace('[]', '')
        df.to_csv(f"data/online_sheet/online_{venue}.csv", sep="\t")

    # Build Paper objects from the cached sheets, skipping rows whose first two
    # columns are empty.
    papers = []
    for venue in VENUE_ORDER:
        paper_list = pd.read_csv(f"data/online_sheet/online_{venue}.csv", sep="\t")
        paper_list["Venue"] = venue
        for _, row in paper_list.iterrows():
            if row.iloc[0] == "" or row.iloc[1] == "":
                continue
            papers.append(Paper.from_raw(row))

    # Run the pipeline stages, each across a pool of worker processes.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))

    # Collect the per-paper metadata and write the combined dataset.
    results = [p.to_dict() for p in papers]
    results_df = pd.DataFrame(results)
    results_df.to_csv("data/papers.csv", sep="\t", index=False)
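
# ---------------------------------------------------------------------------
# Usage sketch (an illustration, not part of the pipeline above): the run
# writes tab-separated output, so downstream evaluation scripts can reload the
# combined metadata with pandas. The snippet assumes only the data/papers.csv
# file produced by this script and is kept in comments so importing this module
# has no side effects.
#
#   import pandas as pd
#   papers_df = pd.read_csv("data/papers.csv", sep="\t")
#   print(f"Loaded {len(papers_df)} papers")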