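"""Collect metadata for arXiv papers on deep learning and cancer.

Pages through the arXiv export API (Atom feeds), keeps articles submitted on
or after 1 Jan 2018, scrapes the title and submission year from each abstract
page in parallel, and writes the result to data/raw/arxiv.csv.
"""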
import os
import re
import sys
import time
from datetime import datetime

import feedparser
import numpy as np
import pandas as pd
import requests
from tqdm.contrib.concurrent import process_map  # tqdm progress bar over a process pool

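# Resolve the project root (two directories up from this script) and put it
# on sys.path so project-local modules can be imported.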
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(ROOT_DIR)

def process_arxiv_paper(article_link):
    """Scrape title, submission year, and PDF URL from an arXiv abstract page."""
    try:
        article_text = requests.get(article_link, timeout=30).text

        # The title sits inside the <h1 class="title mathjax"> element of the abstract page.
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0].strip()

        # The submission line looks like "[Submitted on 3 Mar 2021 ...]";
        # capture the date and keep only the year.
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]

        # The abstract URL ends with the arXiv ID, which maps directly to the PDF URL.
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'

        urls = []  # fetch_url(pdf_url)

        return {"title": title, "year": year, "pdf": pdf_url, "url": urls}

    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None

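# arXiv export API: results are requested page by page by combining a fixed
# `max_results` page size with an increasing `start` offset; each response is
# an Atom feed that feedparser can read directly.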
page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = f"search_query=all:(deep+learning)+AND+all:cancer&max_results={page_size}"

articles = []
start = 0
max_empty_pages = 3  # stop early if we hit consecutive empty pages
empty_pages = 0


# Main entry point: collect article links, then process them in parallel.
# The __main__ guard keeps process_map's worker processes (which re-import
# this module on spawn-based platforms) from re-running the fetch loop.
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")

    while True:
        # Build URL with pagination
        url = f"{base_query}?{query_params}&start={start}"
        
        # Parse the Atom feed
        feed = feedparser.parse(url)
        entries = feed.entries

        if not entries:
            empty_pages += 1
            print(f"⚠️  Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            time.sleep(4)
            start += page_size
            continue

        empty_pages = 0  # reset empty count on success

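        # Keep only articles submitted on or after 1 Jan 2018; `published`
        # in the Atom feed is an ISO-8601 UTC timestamp.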
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)

        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")

        # Stop if fewer than full page — probably the last one
        if len(entries) < page_size:
            print("Reached last page of results.")
            break

        start += page_size
        time.sleep(4)  # Respect rate limit
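    # Deduplicate the collected links, then scrape each abstract page in a
    # pool of 6 worker processes; process_map shows a tqdm progress bar.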
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame and write it to CSV.
    arxiv = pd.DataFrame(results)
    out_path = os.path.join(ROOT_DIR, "data", "raw", "arxiv.csv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    arxiv.to_csv(out_path, index=False)