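"""Scrape Nature search results for deep learning / AI research articles
and record, for each article, its title, year, PDF link, GitHub links from
the "Code availability" section, and whether the "Data availability"
section links out to a public dataset. Results are written to
data/raw/nature.csv.
"""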
import os
import re

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map  # tqdm progress bar over a process pool
# Scrape a single Nature article page and extract its metadata.
def process_nature_paper(article_link):
    # Build the URL outside the try block so the except clause can
    # reference it even if the request itself fails.
    pdf_url = f'https://www.nature.com/articles/{article_link}'
    try:
        article_text = requests.get(pdf_url, timeout=30).text

        # "Code availability" section: collect linked URLs, keeping GitHub links only.
        pattern = r'Code availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        urls = [link for link in matches if "github" in link]

        # Publication year: first datetime attribute on the page.
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Page title.
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        # "Data availability" section: flag whether it links anywhere at all.
        pattern = r'Data availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        dataset_info = "Yes" if len(matches) > 0 else "No"

        # Return a dictionary of the results.
        return {"title": title, "year": year, "pdf": pdf_url + ".pdf", "url": urls, "public": dataset_info}
    except Exception as e:
        print(f"Error processing {pdf_url}: {e}")
        return None
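# Example of the returned record (hypothetical article ID and values,
# for illustration only):
#   process_nature_paper("s41591-020-01234-5")
#   -> {"title": "...", "year": "2020",
#       "pdf": "https://www.nature.com/articles/s41591-020-01234-5.pdf",
#       "url": ["https://github.com/..."], "public": "Yes"}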
# Set debug mode
debug = False

if __name__ == "__main__":
    # Collect article IDs from every search query, paginating until a
    # results page no longer contains article links (capped at 100 pages).
    # Running this under the __main__ guard keeps spawned worker
    # processes from re-scraping the search pages on import.
    search_queries = [
        "https://www.nature.com/search?q=deep+learning&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
        "https://www.nature.com/search?q=AI&order=relevance&article_type=research&journal=commsmed%2Cnm&page=",
    ]
    articles = []
    for search_query in tqdm(search_queries, desc="Search Queries"):
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url, timeout=30).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if not matches:
                break
            articles += matches
            page += 1

    # The two queries overlap, so deduplicate the article IDs.
    articles = np.unique(articles)

    # Scrape the articles in parallel with a progress bar.
    results = process_map(process_nature_paper, articles, max_workers=12, chunksize=1, desc="Processing Articles")
    results = [result for result in results if result is not None]

    # Assemble the records and write them out.
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    os.makedirs('data/raw', exist_ok=True)
    nature.to_csv('data/raw/nature.csv', index=False)
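# The resulting CSV has one row per article: title, year, pdf (direct PDF
# link), url (GitHub links found under "Code availability"), and public
# ("Yes"/"No" for whether "Data availability" links out). Note that the
# regex-based extraction is tied to Nature's current HTML and may need
# updating if the page markup changes.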