            ', paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info. The HTML tags inside this pattern were lost when the
        # snippet was extracted; the <h2>/<p> markers below are an assumed
        # reconstruction of the paper-page markup, not the verified original.
        dataset_match = re.search(
            r'<h2[^>]*>\s*Dataset.*?</h2>\s*<p[^>]*>\s*(.*?)\s*</p>',
            paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,   # link to the code repo, not the paper page
            "public": dataset_info,  # whether a dataset section was found
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
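# NOTE: process_paper must remain at module top level so the worker processes
# used by process_map below can pickle and import it.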
# --- Main Execution ---
if __name__ == "__main__":
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/",
    ]
    # Roots used to resolve the hrefs scraped from each index page: the 2021
    # page uses bare filenames (so the root keeps the full path), while the
    # later pages use absolute paths (so the root is just the domain).
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org",
    ]
    years = [2021, 2022, 2023, 2024]
    all_year_urls = []

    print("🔍 Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                # The 2024 index links papers as /miccai-2024/NNN-PaperXXXX.html
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                # Earlier indices prefix each paper entry with an HTML bullet
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")
print(f"📄 Total papers found: {len(all_year_urls)}")
# --- Parallel scrape each paper page ---
print("⚙️ Processing paper metadata...")
results = process_map(
process_paper,
[item[0] for item in all_year_urls],
[item[1] for item in all_year_urls],
max_workers=12,
chunksize=1,
desc="Parsing Papers"
)
results = [r for r in results if r is not None]
miccai = pd.DataFrame(results)
miccai.to_csv('data/raw/miccai.csv', index=False)
print("✅ Saved to data/miccai.csv")