import re

import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map


# --- Parse a single paper page ---
def process_paper(year, url):
    """Scrape one MICCAI paper page and return its metadata, or None on failure."""
    try:
        paper_page = requests.get(url, timeout=30).text

        # NOTE: the HTML tags inside the original regex patterns were lost in
        # extraction; the patterns below are plausible reconstructions and may
        # need adjusting against the live page markup.

        # Title (assumed to sit in the page's <title> tag)
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repo link (assumed: the first anchor following a "Code" label)
        code_repo_match = re.search(
            r'Code.*?<a\s+href="([^"]+)"',
            paper_page,
            re.DOTALL,
        )
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info (assumed: any tag content following a "Dataset" label;
        # only its presence is recorded below)
        dataset_match = re.search(
            r'Dataset.*?>\s*(.*?)<',
            paper_page,
            re.DOTALL,
        )
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info,
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


# --- Main Execution ---
if __name__ == "__main__":
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/",
    ]
    # Roots used to resolve the relative links found on each index page
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org",
    ]
    years = [2021, 2022, 2023, 2024]

    all_year_urls = []
    print("🔍 Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                # 2024 pages link each paper as /miccai-2024/NNN-PaperNNNN.html
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                # Earlier index pages list one bulleted link per paper
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")

    print(f"📄 Total papers found: {len(all_year_urls)}")

    # --- Parallel scrape each paper page ---
    print("⚙️ Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],
        [item[1] for item in all_year_urls],
        max_workers=12,
        chunksize=1,
        desc="Parsing Papers",
    )

    results = [r for r in results if r is not None]
    miccai = pd.DataFrame(results)
    miccai.to_csv('data/raw/miccai.csv', index=False)
    print("✅ Saved to data/raw/miccai.csv")
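

# A minimal sanity-check sketch (not part of the original script): the CSV
# written above has the columns built in process_paper ("title", "year",
# "url", "public"), so per-year coverage can be tallied as below. Kept as
# comments so the module stays import-safe for process_map's worker processes.
#
#   import pandas as pd
#   df = pd.read_csv('data/raw/miccai.csv')
#   print(df.groupby('year').agg(
#       papers=('title', 'size'),
#       with_code=('url', lambda s: (s.fillna('') != '').sum()),
#       public_dataset=('public', lambda s: (s == 'Yes').sum()),
#   ))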