import pandas as pd
import requests
import pdfplumber
import numpy as np
import re
from multiprocessing import Pool, cpu_count
from functools import partial
import urllib, urllib.request
import os
import sys
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # better for multiprocessing
import feedparser
import time
from datetime import datetime

# Make the project root importable when this script is run directly.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(ROOT_DIR)


# Scrape title, submission year, and PDF URL from one arXiv abstract page.
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        # Title and submission date are scraped from the abstract-page HTML.
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        urls = []  # fetch_url(pdf_url)
        return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None


page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = "search_query=all:(deep+learning)+AND+all:cancer&max_results=100"
articles = []
start = 0
max_empty_pages = 3  # stop early if we hit consecutive empty pages
empty_pages = 0

# Collect article URLs page by page, then process them in parallel with process_map.
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")
    while True:
        # Build URL with pagination
        url = f"{base_query}?{query_params}&start={start}"

        # Parse the Atom feed
        feed = feedparser.parse(url)
        entries = feed.entries

        if not entries:
            empty_pages += 1
            print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            time.sleep(4)
            start += page_size
            continue

        empty_pages = 0  # reset empty count on success

        # Keep only articles submitted since 2018.
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)

        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")

        # Stop if fewer than a full page was returned: probably the last page.
        if len(entries) < page_size:
            print("Reached last page of results.")
            break

        start += page_size
        time.sleep(4)  # Respect the arXiv API rate limit

    # Deduplicate and scrape each abstract page in parallel.
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('data/raw/arxiv.csv')
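

# The commented-out fetch_url(pdf_url) call and the otherwise unused
# pdfplumber/urllib.request imports suggest a download-and-extract helper.
# The function below is only a minimal sketch under that assumption; the
# name, output directory, and return value are guesses, not the project's
# actual helper. It caches the PDF locally and returns the text of each page.
def fetch_url(pdf_url, out_dir="data/raw/pdfs"):
    """Download an arXiv PDF (if not already cached) and return per-page text."""
    os.makedirs(out_dir, exist_ok=True)
    local_path = os.path.join(out_dir, pdf_url.split("/")[-1] + ".pdf")
    if not os.path.exists(local_path):
        urllib.request.urlretrieve(pdf_url, local_path)
    with pdfplumber.open(local_path) as pdf:
        return [page.extract_text() or "" for page in pdf.pages]
# If wired in, it would replace the placeholder inside process_arxiv_paper,
# e.g. urls = fetch_url(pdf_url) instead of urls = [].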