import pandas as pd
import requests
import re
import os
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count

# --- Parse a single paper page ---
def process_paper(year, url):
    try:
        paper_page = requests.get(url, timeout=30).text  # timeout guards against hanging requests

        # Title
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repo link
        code_repo_match = re.search(r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info
        dataset_match = re.search(r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"
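
        # Illustrative markup these regexes assume on a paper page (a hypothetical
        # sketch reconstructed from the patterns above, not an actual MICCAI page):
        #   <title>Paper Title</title>
        #   <h1 id="code-id">Code</h1>
        #   <p><a href="https://github.com/user/repo">repo</a></p>
        #   <h1 id="dataset-id">Dataset</h1>
        #   <p>Dataset description ... <br />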

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
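
# A sketch of the expected return shape, assuming a paper page that links a public
# code repo and describes a dataset (the URL and values below are hypothetical):
#   process_paper(2024, "https://papers.miccai.org/miccai-2024/001-Paper0001.html")
#   -> {"title": "...", "year": 2024, "url": "https://github.com/...", "public": "Yes"}
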
# --- Main Execution ---
if __name__ == "__main__":
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/"
    ]
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org"
    ]
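    # Each root is prepended to the hrefs scraped from the matching index page.
    # Presumably the 2021 index links bare filenames (hence the full
    # /openaccess/paperlinks/ root), while later years use site-absolute paths.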
    years = [2021, 2022, 2023, 2024]

    all_year_urls = []
    print("🔍 Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
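                # Hypothetical index line the 2021-2023 branch expects, reconstructed
                # from the parsing logic above (not a verbatim page excerpt):
                #   &bullet; <a href="/2022/papers/0001.html">Paper Title</a>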
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")
print(f"๐Ÿ“„ Total papers found: {len(all_year_urls)}")
# --- Parallel scrape each paper page ---
print("โš™๏ธ Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],  # years
        [item[1] for item in all_year_urls],  # paper URLs
        max_workers=min(12, cpu_count()),  # cap workers at the available CPU cores
        chunksize=1,
        desc="Parsing Papers"
    )
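    # process_map zips the two argument lists and calls process_paper(year, url)
    # for each pair across worker processes (tqdm's parallel analogue of map).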
    results = [r for r in results if r is not None]

    miccai = pd.DataFrame(results)
    os.makedirs('data/raw', exist_ok=True)  # make sure the output directory exists
    miccai.to_csv('data/raw/miccai.csv', index=False)
    print("✅ Saved to data/raw/miccai.csv")