import pandas as pd
import requests
import pdfplumber
import re
from multiprocessing import Pool, cpu_count
from functools import partial
import os


# Function to process each URL
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text

        # Assumes the arXiv abstract page markup:
        # <h1 class="title mathjax"><span class="descriptor">Title:</span>...</h1>
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]

        # Download the PDF for this article
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        response = requests.get(pdf_url)
        if response.status_code == 200:
            with open(f"{article_id}.pdf", 'wb') as file:
                file.write(response.content)
        else:
            print("Failed to fetch pdf")
            return None

        urls = []
        link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
        with pdfplumber.open(f"{article_id}.pdf") as pdf:
            # Loop through all pages
            for page_num, page in enumerate(pdf.pages):
                # Extract text from the page (can be None for image-only pages)
                text = page.extract_text()
                if not text:
                    continue
                # Search for GitHub links on the page
                found_urls = re.findall(link_pattern, text)
                urls.extend(found_urls)
        os.remove(f"{article_id}.pdf")

        # Keep only links that point to a user/repo, dropping framework repos
        urls = [url for url in urls
                if ("pytorch" not in url)
                and ("fchollet" not in url)
                and (len(url.split("github.com")[1].split("/")) >= 3)]
        print(urls)
        url = urls[0] if len(urls) > 0 else ""

        # Return a dictionary of the results
        return {"venue": "arXiv", "title": title, "url": url, "year": year}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None


# Set debug mode
debug = False

# Fetch all URLs for each year
all_year_urls = []
page_size = 50
search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']

articles = []
for search_query in search_queries:
    page = 0
    while page <= 100:
        start_idx = page_size * page
        url = f"{search_query}{start_idx}"
        current_page = requests.get(url).text
        # Assumes each result on the search page lists its abstract link as
        # <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/...">...</a>
        pattern = r'<p class="list-title is-inline-block"><a href="(https://arxiv\.org/abs/.*?)">.*?</a>'