import pandas as pd
import requests
import pdfplumber
import numpy as np
import re
from multiprocessing import Pool, cpu_count
from functools import partial
import urllib, urllib.request
import os
import sys
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # better for multiprocessing
import feedparser
import time
from datetime import datetime
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(ROOT_DIR)
# Scrape the title, submission year, and PDF URL from a single arXiv abstract page
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        urls = []  # fetch_url(pdf_url)
        return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
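# NOTE: the disabled `fetch_url(pdf_url)` call above implies a helper that downloads
# the PDF and extracts its text (pdfplumber is imported but otherwise unused here).
# The function below is a hypothetical sketch under that assumption, not the original
# implementation; it is never called, so the script's behavior is unchanged.
def fetch_url(pdf_url):
    import io
    response = requests.get(pdf_url)
    response.raise_for_status()
    # Extract the text of each page from the in-memory PDF
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        return [page.extract_text() or "" for page in pdf.pages]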
page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = "search_query=all:(deep+learning)+AND+all:cancer&max_results=100"  # max_results must match page_size
articles = []
start = 0
max_empty_pages = 3  # stop early if we hit consecutive empty pages
empty_pages = 0
# Page through the arXiv API, then process the collected articles in parallel
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")
    while True:
        # Build URL with pagination
        url = f"{base_query}?{query_params}&start={start}"
        # Parse the Atom feed
        feed = feedparser.parse(url)
        entries = feed.entries
        if not entries:
            empty_pages += 1
            print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            time.sleep(4)
            start += page_size
            continue
        empty_pages = 0  # reset empty count on success
        # Keep only articles submitted in 2018 or later
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)
        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")
        # A short page means we have reached the last page of results
        if len(entries) < page_size:
            print("Reached last page of results.")
            break
        start += page_size
        time.sleep(4)  # Respect the arXiv API rate limit
    # Deduplicate article URLs before processing
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('data/raw/arxiv.csv')
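    # Optional sanity check (illustrative addition, not part of the original pipeline):
    # reload the CSV written above and report how many articles were saved.
    check = pd.read_csv('data/raw/arxiv.csv', index_col=0)
    print(f"Saved {len(check)} articles to data/raw/arxiv.csv")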