import os
import re

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # process pool with a built-in progress bar


# Scrape one Nature article page and extract metadata plus code/data availability info
def process_nature_paper(article_link):
    pdf_url = f'https://www.nature.com/articles/{article_link}'
    try:
        article_text = requests.get(pdf_url, timeout=30).text

        # First hyperlink inside the "Code availability" section, if any
        pattern = r'Code availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        urls = matches[0] if len(matches) > 0 else ""

        # Publication year, taken from the first datetime attribute on the page
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Article title from the <title> tag
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        # Treat a hyperlink in the "Data availability" section as a sign of a public dataset
        pattern = r'Data availability.*?<a href="([^"]+)"'
        dataset_info = "Yes" if (len(re.findall(pattern, article_text, re.DOTALL)) > 0) else "No"

        # Return a dictionary of the results
        return {"title": title, "year": year, "pdf": pdf_url + ".pdf",
                "url": urls, "public": dataset_info}
    except Exception as e:
        print(f"Error processing {pdf_url}: {e}")
        return None


# Nature search queries (deep learning / AI research articles in Communications
# Medicine and Nature Medicine); the page number is appended per request
search_queries = [
    "https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
    "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
]

if __name__ == "__main__":
    # Collect article slugs from the paginated search results. Keeping this
    # under the main guard stops spawned worker processes from re-running it.
    articles = []
    for search_query in tqdm(search_queries, desc="Search Queries"):
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url, timeout=30).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if not matches:
                break  # past the last page of results
            page += 1
            articles += matches
    articles = np.unique(articles)

    # Process articles in parallel with a progress bar
    results = process_map(process_nature_paper, articles,
                          max_workers=12, chunksize=1, desc="Processing Articles")
    results = [result for result in results if result is not None]

    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    os.makedirs('data/raw', exist_ok=True)
    nature.to_csv('data/raw/nature.csv', index=False)
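
# A minimal single-article sketch for debugging (the slug below is hypothetical;
# substitute any slug collected by the search loop). Calling the scraper outside
# the pool surfaces regex or network failures immediately:
#
#     result = process_nature_paper("s41591-020-01197-2")
#     print(result)
#     # e.g. {'title': ..., 'year': ..., 'pdf': 'https://www.nature.com/articles/s41591-020-01197-2.pdf',
#     #       'url': ..., 'public': ...}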