import os
import re

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # better for multiprocessing
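
# Overview: crawl Nature search results for deep-learning/AI research
# articles, then extract each article's title, year, PDF link, code link,
# and a Yes/No flag for publicly available data.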
# Function to process one article: download the page and extract metadata
def process_nature_paper(article_link):
    try:
        pdf_url = f'https://www.nature.com/articles/{article_link}'
        article_text = requests.get(pdf_url, timeout=30).text
        # Grab the first link in the "Code availability" section, if any
        pattern = r'Code availability.*?href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        urls = matches[0] if len(matches) > 0 else ""
        # Publication year from the first datetime attribute on the page
        year = re.findall(r'datetime="(\d{4})', article_text)[0]
        # Find title
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        # Flag the data as public if the "Data availability" section links out
        pattern = r'Data availability.*?href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        dataset_info = "Yes" if (len(matches) > 0) else "No"
        # Return a dictionary of the results
        return {"title": title, "year": year, "pdf": pdf_url + ".pdf", "url": urls, "public": dataset_info}
    except Exception as e:
        print(f"Error processing {pdf_url}: {e}")
        return None
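
# Example (hypothetical article slug): process_nature_paper("s41591-024-00000-0")
# fetches https://www.nature.com/articles/s41591-024-00000-0 and returns e.g.
# {"title": "...", "year": "2024", "pdf": "...-00000-0.pdf",
#  "url": "https://github.com/...", "public": "Yes"},
# or None if the request or parsing fails.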
# Set debug mode
debug = False

# Search queries: "deep learning" and "AI" research articles in
# Communications Medicine and Nature Medicine
search_queries = [
    "https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
    "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
]

if __name__ == "__main__":
    # Crawl the paginated search results under the main guard so that the
    # worker processes spawned by process_map do not re-run the crawl when
    # they import this module.
    articles = []
    for search_query in tqdm(search_queries, desc="Search Queries"):
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url).text
            # Article links on a results page look like href="/articles/<slug>"
            pattern = r'href="/articles/([^"]+)"'
            matches = re.findall(pattern, current_page)
            # An empty page means the results are exhausted
            if not matches:
                break
            articles += matches
            page += 1
    # Drop duplicate slugs picked up by both queries
    articles = np.unique(articles)
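
    # Hypothetical use of the debug flag defined above: truncate the article
    # list for a quick end-to-end test run (no effect while debug is False).
    if debug:
        articles = articles[:10]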
    # Process articles in parallel; process_map wraps multiprocessing.Pool
    # with a progress bar
    results = process_map(process_nature_paper, articles, max_workers=12, chunksize=1, desc="Processing Articles")
    # Discard articles that failed to download or parse
    results = [result for result in results if result is not None]
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    os.makedirs('data/raw', exist_ok=True)  # make sure the output directory exists
    nature.to_csv('data/raw/nature.csv')
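
# Each row of data/raw/nature.csv describes one article: title, year, pdf
# (direct PDF link), url (first link in the "Code availability" section,
# empty if none), and public ("Yes" if a "Data availability" link was found).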