# reproduce/data_generation/fetch_processed.py
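"""Fetch and preprocess the paper dataset: pull the per-venue paper lists from
the published online sheet, download PDFs, extract URLs from them, clone the
linked GitHub repos, and convert the PDFs to Grobid XML, writing the combined
results to data/papers.csv."""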
import os
import sys

# Make the repository root importable when running this script directly.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(ROOT_DIR)

import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

from config.constants import VENUE_ORDER
from core.paper import Paper
from core.conversion import download_pdf, download_repo, pdf_to_grobid
from evaluations.url import fetch_url
def get_urls_and_assign(paper):
    """Extract URLs from the paper's PDF and store them on the Paper object."""
    try:
        urls = fetch_url(paper.pdf_path)
        paper.urls_auto = urls  # Only the auto-extracted URLs are updated.
    except Exception as e:
        paper.log("ERROR", str(e))
    return paper
def download_xml(paper):
    """Convert the paper's PDF to Grobid XML, skipping papers that have no PDF
    or whose XML already exists."""
    try:
        if paper.pdf_path is None:
            return paper
        if os.path.exists(paper.xml_path):
            paper.log("NOTE", f"XML already exists for {paper.paper_id}, skipping download.")
            return paper
        pdf_to_grobid(paper.pdf_path, paper.xml_path)
        return paper
    except Exception as e:
        paper.log("ERROR", f"Grobid XML conversion failed: {e}")
        return paper
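# Each worker returns the (possibly updated) Paper object because
# ProcessPoolExecutor pickles arguments into separate processes: mutations
# made inside a worker are not visible in the parent unless the object is
# returned and collected from executor.map().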
# Modest parallelism: every stage is network-bound (sheet host, arXiv/GitHub,
# Grobid server), so a small pool avoids overloading the remote services.
max_workers = 2
if __name__ == "__main__":
    # Stage 1: pull each venue's sheet from the published Google Sheet and
    # cache it locally as a tab-separated file.
    os.makedirs("data/online_sheet", exist_ok=True)
    for venue in VENUE_ORDER:
        df = pd.read_excel(
            "https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx",
            sheet_name=venue,
        )
        df = df.replace('\t', ' ', regex=True)  # Tabs would break the TSV output.
        df = df.replace('[]', '')               # Treat empty-list cells as blank.
        df.to_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")

    # Stage 2: build Paper objects from the cached sheets, skipping rows whose
    # first two fields are empty. read_csv yields NaN (not "") for empty cells,
    # so the checks must be NaN-safe.
    papers = []
    for venue in VENUE_ORDER:
        paper_list = pd.read_csv(f'data/online_sheet/online_{venue}.csv', sep="\t")
        paper_list["Venue"] = venue
        for _, row in paper_list.iterrows():
            if pd.isna(row.iloc[0]) or row.iloc[0] == "":
                continue
            if pd.isna(row.iloc[1]) or row.iloc[1] == "":
                continue
            papers.append(Paper.from_raw(row))

    # Stages 3-6: run each processing step over all papers in a small process
    # pool, collecting the updated Paper objects after every pass.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_pdf, papers), total=len(papers), desc="Downloading PDFs"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(get_urls_and_assign, papers), total=len(papers), desc="Extracting URLs"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_repo, papers), total=len(papers), desc="Downloading GitHub repos"))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        papers = list(tqdm(executor.map(download_xml, papers), total=len(papers), desc="Downloading Grobid XMLs"))

    # Stage 7: flatten the Paper objects into a single tab-separated table.
    results = [p.to_dict() for p in papers]
    results_df = pd.DataFrame(results)
    results_df.to_csv("data/papers.csv", sep="\t", index=False)
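# Usage (a sketch, assuming a Grobid server is reachable and the repository
# layout matches the imports above):
#   python reproduce/data_generation/fetch_processed.py
# Produces data/online_sheet/online_<venue>.csv per venue and data/papers.csv.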