import pandas as pd
import requests
import re
from multiprocessing import Pool

# Fetch one Nature article page and extract its metadata: title,
# publication year, a code-availability link (preferring GitHub), and
# whether the "Data availability" section links anywhere.
def process_nature_paper(article_link):
    try:
        url = f'https://www.nature.com/articles/{article_link}'
        article_text = requests.get(url).text

        # Prefer a GitHub link from the "Code availability" section; fall
        # back to the first link found there, or an empty string.
        pattern = r'Code availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        github_links = [link for link in matches if "github" in link]
        code_url = github_links[0] if github_links else (matches[0] if matches else "")

        # Publication year from the first datetime attribute on the page
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Article title from the <title> tag
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        # Flag whether the "Data availability" section links to anything
        pattern = r'Data availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        dataset_info = "Yes" if matches else "No"

        return {"title": title, "url": code_url, "year": year, "public": dataset_info, "pdf": ""}

    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
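# A minimal usage sketch for the function above (the article slug is
# hypothetical, for illustration only):
#
#   row = process_nature_paper("s41591-024-00000-0")  # hypothetical slug
#   if row is not None:
#       print(row["title"], row["year"], row["url"])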

# Search queries: "deep learning" and "AI" in Nature Medicine (nm) and
# Communications Medicine (commsmed), ordered by relevance, paged.
search_queries = [
    "https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=",
    "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page=",
]

if __name__ == "__main__":
    # Collect article slugs from the paged search results. Running this
    # inside the __main__ guard keeps it from re-executing in every worker
    # when multiprocessing uses the spawn start method.
    articles = []
    for search_query in search_queries:
        page = 1
        while page <= 100:
            current_page = requests.get(f"{search_query}{page}").text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if not matches:
                break  # no further results for this query
            articles += matches
            page += 1
    articles = sorted(set(articles))  # de-duplicate slugs across both queries

    # Process the articles in parallel
    with Pool(processes=12) as pool:
        results = pool.map(process_nature_paper, articles)

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    nature.to_csv('nature.csv')
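    # Quick sanity check (a sketch, not required by the pipeline): reload
    # the CSV just written to confirm it round-trips through pandas.
    check = pd.read_csv('nature.csv', index_col=0)
    print(check.head())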