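The script below scrapes the MICCAI 2021-2024 paper indexes, visits each paper page in parallel, and records the title, the code-repository link, and whether the page has a dataset section.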
import re
from multiprocessing import cpu_count

import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map


# --- Parse a single paper page ---
def process_paper(year, url):
    """Extract title, code-repo link, and dataset availability from one paper page."""
    try:
        # timeout guards against requests that hang indefinitely
        paper_page = requests.get(url, timeout=30).text

        # Title
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repository link
        code_repo_match = re.search(
            r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL
        )
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info
        dataset_match = re.search(
            r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL
        )
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info,
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
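# For reference, the regexes above assume paper pages shaped roughly like the
# following (hypothetical snippet, not taken from a real MICCAI page):
#
#   <title>Paper Title</title>
#   <h1 id="code-id">Code</h1>
#   <p><a href="https://github.com/author/repo">https://github.com/author/repo</a></p>
#   <h1 id="dataset-id">Dataset</h1>
#   <p>Uses a public dataset. <br /></p>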
# --- Main execution ---
if __name__ == "__main__":
    # Per-year listing pages, and the roots used to resolve each year's links.
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/",
    ]
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org",
    ]
    years = [2021, 2022, 2023, 2024]

    all_year_urls = []
    print("Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                # The 2024 site links papers as /miccai-2024/NNN-PaperNNNN.html.
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                # Earlier sites list each paper as a bullet item with an href.
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")

    print(f"Total papers found: {len(all_year_urls)}")
    # --- Parallel scrape of each paper page ---
    print("Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],
        [item[1] for item in all_year_urls],
        max_workers=cpu_count(),  # use all available cores
        chunksize=1,
        desc="Parsing Papers",
    )

    # Drop papers that failed to download or parse.
    results = [r for r in results if r is not None]

    miccai = pd.DataFrame(results)
    miccai.to_csv("data/raw/miccai.csv", index=False)
    print("Saved to data/raw/miccai.csv")
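Once the CSV exists, a quick sanity check is to count how many papers per year report a code link. A minimal sketch, assuming the data/raw/miccai.csv produced above; this snippet is illustrative and not part of the scraper:

import pandas as pd

miccai = pd.read_csv("data/raw/miccai.csv")

# Fraction of papers per year with a non-empty code-repository link.
has_code = miccai["url"].notna() & (miccai["url"] != "")
print(miccai.assign(has_code=has_code).groupby("year")["has_code"].mean())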