import pandas as pd
import requests
import pdfplumber
import numpy as np
import re
from multiprocessing import Pool, cpu_count
from functools import partial
import urllib, urllib.request
import os
import sys
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # better for multiprocessing
import feedparser
import time
from datetime import datetime
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(ROOT_DIR)
# Scrape the title, submission year, and PDF URL from a single arXiv abstract page
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        urls = []  # fetch_url(pdf_url)
        return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
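# NOTE: the disabled `fetch_url(pdf_url)` call above implies a helper that downloads
# the PDF and extracts its text (pdfplumber is imported but otherwise unused here).
# The function below is a hypothetical sketch under that assumption, not the original
# implementation; it is never called, so the script's behavior is unchanged.
def fetch_url(pdf_url):
    import io
    response = requests.get(pdf_url)
    response.raise_for_status()
    # Extract the text of each page from the in-memory PDF
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        return [page.extract_text() or "" for page in pdf.pages]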
page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = "search_query=all:(deep+learning)+AND+all:cancer&max_results=100"  # max_results must match page_size
articles = []
start = 0
max_empty_pages = 3  # stop early if we hit consecutive empty pages
empty_pages = 0
# Page through the arXiv API, then process the collected articles in parallel
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")
    while True:
        # Build URL with pagination
        url = f"{base_query}?{query_params}&start={start}"
        # Parse the Atom feed
        feed = feedparser.parse(url)
        entries = feed.entries
        if not entries:
            empty_pages += 1
            print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            time.sleep(4)
            start += page_size
            continue
        empty_pages = 0  # reset empty count on success
        # Keep only articles submitted in 2018 or later
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)
        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")
        # A short page means we have reached the last page of results
        if len(entries) < page_size:
            print("Reached last page of results.")
            break
        start += page_size
        time.sleep(4)  # Respect the arXiv API rate limit
    # Deduplicate article URLs before processing
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('data/raw/arxiv.csv')
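    # Optional sanity check (illustrative addition, not part of the original pipeline):
    # reload the CSV written above and report how many articles were saved.
    check = pd.read_csv('data/raw/arxiv.csv', index_col=0)
    print(f"Saved {len(check)} articles to data/raw/arxiv.csv")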