import sys
import os

ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(ROOT_DIR)

import csv
import re

import numpy as np
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from evaluations.utils import *
from evaluations.url import fetch_url
from core.paper import Paper
from core.conversion import download_repo, download_pdf, pdf_to_grobid
from config.constants import VENUE_ORDER
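
# Pipeline overview: pull the paper index from the shared Google Sheet, build
# Paper objects, then run four per-paper stages in process pools: download the
# PDF, extract candidate URLs, clone the linked GitHub repo, and convert the
# PDF to TEI XML via Grobid. Results are flattened to data/papers.csv.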
def get_urls_and_assign(paper):
    """Extract URLs from the paper's PDF and record them on the Paper object."""
    try:
        urls = fetch_url(paper.pdf_path)
        paper.urls_auto = urls  # only the auto-extracted URLs are updated here
    except Exception as e:
        paper.log("ERROR", str(e))
    return paper
def download_xml(paper):
    """Convert the paper's PDF to TEI XML with Grobid, skipping existing files."""
    try:
        if paper.pdf_path is None:
            return paper
        if os.path.exists(paper.xml_path):
            paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
            return paper
        pdf_to_grobid(paper.pdf_path, paper.xml_path)
        return paper
    except Exception as e:
        paper.log("ERROR", f"Grobid conversion failed: {e}")
        return paper
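
# The __main__ block below repeats the same map-with-progress pattern for each
# stage. A minimal sketch of that pattern as a helper (a hypothetical refactor,
# not part of the original pipeline; `stage_fn` stands for any of the per-paper
# functions above) could look like:
def run_stage(stage_fn, papers, desc, workers=2):
    # Map stage_fn over all papers in a process pool, with a tqdm progress bar.
    with ProcessPoolExecutor(max_workers=workers) as executor:
        return list(tqdm(executor.map(stage_fn, papers), total=len(papers), desc=desc))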
max_workers = 2  # worker processes per stage
if __name__ == "__main__":
    # Snapshot each venue's tab of the published Google Sheet workbook.
    for venue in VENUE_ORDER:
        df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
        df = df.replace('\t', ' ', regex=True)  # tabs would break the TSV output
        df = df.replace('[]', '')
        df.to_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")

    # Build Paper objects, skipping rows with a missing ID or title.
    papers = []
    for venue in VENUE_ORDER:
        paper_list = pd.read_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
        paper_list["Venue"] = venue
        for _, row in paper_list.iterrows():
            # Empty cells come back from read_csv as NaN, not "", so test both.
            if pd.isna(row.iloc[0]) or row.iloc[0] == "":
                continue
            if pd.isna(row.iloc[1]) or row.iloc[1] == "":
                continue
            papers.append(Paper.from_raw(row))

    # Run each per-paper stage over the full list in a small process pool.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))

    # Flatten the enriched Paper objects into a single TSV.
    results = [p.to_dict() for p in papers]
    results_df = pd.DataFrame(results)
    results_df.to_csv("data/papers.csv", sep="\t", index=False)
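
# Usage sketch (assumptions, not confirmed by this file: a Grobid server is
# reachable for pdf_to_grobid, and data/online_sheet/ exists before the
# per-venue TSVs are written):
#   python fetch_papers.py   # script name is hypothetical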