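"""Collect arXiv papers matching "deep learning" AND "cancer" submitted since 2018.

The script pages through the arXiv Atom API, gathers article links, scrapes each
abstract page in parallel for the title, submission year, and PDF URL, and writes
the result to data/raw/arxiv.csv.
"""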
import os
import re
import sys
import time
from datetime import datetime

import feedparser
import numpy as np
import pandas as pd
import requests
from tqdm.contrib.concurrent import process_map  # better for multiprocessing

ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(ROOT_DIR)
# Scrape an arXiv abstract page for the paper title, submission year, and PDF URL
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        urls = []  # fetch_url(pdf_url)
        return {"title": title, "year": year, "pdf": pdf_url, "url": urls}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None

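# A successful call returns a dict of the form below (values are illustrative
# placeholders, not taken from a real arXiv record):
#     {"title": "Example Title", "year": "2020",
#      "pdf": "https://arxiv.org/pdf/2001.00001", "url": []}
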
# arXiv API pagination: request `page_size` results per call and advance with `start`
page_size = 100
base_query = "http://export.arxiv.org/api/query"
query_params = f"search_query=all:(deep+learning)+AND+all:cancer&max_results={page_size}"
articles = []
start = 0
max_empty_pages = 3  # stop early if we hit this many consecutive empty pages
empty_pages = 0

# Collect article links page by page, then scrape them in parallel with process_map
if __name__ == "__main__":
    print("Fetching arXiv article URLs...")
    while True:
        # Build the request URL with pagination
        url = f"{base_query}?{query_params}&start={start}"
        # Parse the Atom feed returned by the arXiv API
        feed = feedparser.parse(url)
        entries = feed.entries
        if not entries:
            empty_pages += 1
            print(f"⚠️ Empty page at start={start}. Consecutive empty pages: {empty_pages}")
            if empty_pages >= max_empty_pages:
                print("Stopping early due to repeated empty results.")
                break
            time.sleep(4)
            start += page_size
            continue
        empty_pages = 0  # reset the empty-page count on success
        # Keep only articles submitted on or after 1 January 2018
        for entry in entries:
            pub_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if pub_date >= datetime(2018, 1, 1):
                articles.append(entry.link)
        # Log progress
        print(f"✅ Fetched {len(entries)} entries at start={start}. Total collected: {len(articles)}")
        # A partially filled page is probably the last page of results
        if len(entries) < page_size:
            print("Reached last page of results.")
            break
        start += page_size
        time.sleep(4)  # Respect the arXiv API rate limit

    # Deduplicate the collected links, then scrape each abstract page in parallel
    articles = np.unique(articles)
    results = process_map(process_arxiv_paper, articles, max_workers=6, chunksize=1, desc="Processing Articles")
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('data/raw/arxiv.csv')
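# Quick sanity check of the output (illustrative, run separately after the script finishes):
#     df = pd.read_csv("data/raw/arxiv.csv")
#     print(df[["title", "year"]].head())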