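The script below scrapes the MICCAI 2021-2024 paper indexes, visits each paper page in parallel, and records the title, the code-repository link, and whether the page has a dataset section.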
import re
from multiprocessing import cpu_count

import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map


# --- Parse a single paper page ---
def process_paper(year, url):
    """Extract title, code-repo link, and dataset availability from one paper page."""
    try:
        # timeout guards against requests that hang indefinitely
        paper_page = requests.get(url, timeout=30).text

        # Title
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repository link
        code_repo_match = re.search(
            r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL
        )
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info
        dataset_match = re.search(
            r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL
        )
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info,
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
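# For reference, the regexes above assume paper pages shaped roughly like the
# following (hypothetical snippet, not taken from a real MICCAI page):
#
#   <title>Paper Title</title>
#   <h1 id="code-id">Code</h1>
#   <p><a href="https://github.com/author/repo">https://github.com/author/repo</a></p>
#   <h1 id="dataset-id">Dataset</h1>
#   <p>Uses a public dataset. <br /></p>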
# --- Main execution ---
if __name__ == "__main__":
    # Per-year listing pages, and the roots used to resolve each year's links.
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/",
    ]
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org",
    ]
    years = [2021, 2022, 2023, 2024]

    all_year_urls = []
    print("Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                # The 2024 site links papers as /miccai-2024/NNN-PaperNNNN.html.
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                # Earlier sites list each paper as a bullet item with an href.
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")

    print(f"Total papers found: {len(all_year_urls)}")
    # --- Parallel scrape of each paper page ---
    print("Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],
        [item[1] for item in all_year_urls],
        max_workers=cpu_count(),  # use all available cores
        chunksize=1,
        desc="Parsing Papers",
    )

    # Drop papers that failed to download or parse.
    results = [r for r in results if r is not None]

    miccai = pd.DataFrame(results)
    miccai.to_csv("data/raw/miccai.csv", index=False)
    print("Saved to data/raw/miccai.csv")
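Once the CSV exists, a quick sanity check is to count how many papers per year report a code link. A minimal sketch, assuming the data/raw/miccai.csv produced above; this snippet is illustrative and not part of the scraper:

import pandas as pd

miccai = pd.read_csv("data/raw/miccai.csv")

# Fraction of papers per year with a non-empty code-repository link.
has_code = miccai["url"].notna() & (miccai["url"] != "")
print(miccai.assign(has_code=has_code).groupby("year")["has_code"].mean())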