import re

import pandas as pd
import requests
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map


# --- Parse a single paper page ---
def process_paper(year, url):
    """Scrape one MICCAI paper page and return its metadata, or None on failure."""
    try:
        paper_page = requests.get(url, timeout=30).text

        # NOTE: the HTML tags inside the original regex patterns were lost in
        # extraction; the patterns below are plausible reconstructions and may
        # need adjusting against the live page markup.

        # Title (assumed to sit in the page's <title> tag)
        title_match = re.search(r'<title>(.*?)\s*</title>', paper_page, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""

        # Code repo link (assumed: the first anchor following a "Code" label)
        code_repo_match = re.search(
            r'Code.*?<a\s+href="([^"]+)"',
            paper_page,
            re.DOTALL,
        )
        code_repo_link = code_repo_match.group(1).strip() if code_repo_match else ""

        # Dataset info (assumed: any tag content following a "Dataset" label;
        # only its presence is recorded below)
        dataset_match = re.search(
            r'Dataset.*?>\s*(.*?)<',
            paper_page,
            re.DOTALL,
        )
        dataset_info = "Yes" if dataset_match else "No"

        return {
            "title": title,
            "year": year,
            "url": code_repo_link,
            "public": dataset_info,
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


# --- Main Execution ---
if __name__ == "__main__":
    MICCAI_pages = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org/2022/papers/",
        "https://conferences.miccai.org/2023/papers/",
        "https://papers.miccai.org/miccai-2024/",
    ]
    # Roots used to resolve the relative links found on each index page
    MICCAI_root = [
        "https://miccai2021.org/openaccess/paperlinks/",
        "https://conferences.miccai.org",
        "https://conferences.miccai.org",
        "https://papers.miccai.org",
    ]
    years = [2021, 2022, 2023, 2024]

    all_year_urls = []
    print("🔍 Fetching paper URLs by year...")
    for i in tqdm(range(len(MICCAI_pages)), desc="Years"):
        try:
            response = requests.get(MICCAI_pages[i], timeout=30)
            year_page = response.text
            if years[i] == 2024:
                # 2024 pages link each paper as /miccai-2024/NNN-PaperNNNN.html
                matches = re.findall(r'href="(/miccai-2024/\d{3}-Paper\d+\.html)"', year_page)
                urls = [MICCAI_root[i] + match for match in matches]
            else:
                # Earlier index pages list one bulleted link per paper
                urls = [
                    MICCAI_root[i] + line.split('href="')[1].split('"')[0]
                    for line in year_page.split('\n')
                    if "&bullet" in line and 'href="' in line
                ]
            all_year_urls.extend([(years[i], url) for url in urls])
        except Exception as e:
            print(f"Failed to fetch year {years[i]}: {e}")

    print(f"📄 Total papers found: {len(all_year_urls)}")

    # --- Parallel scrape each paper page ---
    print("⚙️ Processing paper metadata...")
    results = process_map(
        process_paper,
        [item[0] for item in all_year_urls],
        [item[1] for item in all_year_urls],
        max_workers=12,
        chunksize=1,
        desc="Parsing Papers",
    )

    results = [r for r in results if r is not None]
    miccai = pd.DataFrame(results)
    miccai.to_csv('data/raw/miccai.csv', index=False)
    print("✅ Saved to data/raw/miccai.csv")
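

# A minimal sanity-check sketch (not part of the original script): the CSV
# written above has the columns built in process_paper ("title", "year",
# "url", "public"), so per-year coverage can be tallied as below. Kept as
# comments so the module stays import-safe for process_map's worker processes.
#
#   import pandas as pd
#   df = pd.read_csv('data/raw/miccai.csv')
#   print(df.groupby('year').agg(
#       papers=('title', 'size'),
#       with_code=('url', lambda s: (s.fillna('') != '').sum()),
#       public_dataset=('public', lambda s: (s == 'Yes').sum()),
#   ))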