import re
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests


def process_nature_paper(article_link):
    """Fetch one Nature article page and extract basic metadata."""
    try:
        url = f'https://www.nature.com/articles/{article_link}'
        article_text = requests.get(url, timeout=30).text

        # First paragraph of the "Code availability" section (extracted but not
        # included in the returned record). NOTE: the original regex was garbled,
        # so the tag pattern below is an assumption about the page markup.
        pattern = r'Code availability.*?<p[^>]*>(.*?)</p>'
        matches = re.findall(pattern, article_text, re.DOTALL)
        code_info = matches[0] if len(matches) > 0 else ""

        # Publication year from the first datetime="YYYY-..." attribute.
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Article title from the <title> tag (tag delimiters assumed; the
        # original pattern was garbled).
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        # Mark the paper "Yes" if a "Data availability" section is present
        # (same markup assumption as above).
        pattern = r'Data availability.*?<p[^>]*>(.*?)</p>'
        dataset_info = "Yes" if (len(re.findall(pattern, article_text, re.DOTALL)) > 0) else "No"

        # Return a dictionary of the results.
        return {"title": title, "url": url, "year": year,
                "public": dataset_info, "pdf": ""}
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


if __name__ == "__main__":
    # Collect article slugs from the Nature search pages (kept under the main
    # guard so worker processes do not repeat the scraping).
    search_queries = [
        "https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=",
        "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page=",
    ]
    articles = []
    for search_query in search_queries:
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url, timeout=30).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if len(matches) == 0:
                break
            page += 1
            articles += matches

    # Deduplicate the collected article slugs.
    articles = np.unique(articles)

    # Process the articles in parallel.
    with Pool(processes=12) as pool:
        results = pool.map(process_nature_paper, articles)

    # Filter out any None results due to errors.
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame and save it.
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    nature.to_csv('nature.csv')