# core/paper.py
import ast
import datetime
import hashlib
import json
import os
import re
import uuid
from enum import Enum

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
import streamlit as st

from config.constants import LogType

# Log levels that `Paper.log` knows how to render.
_DISPLAYABLE_LEVELS = ("TIP", "LOG", "TITLE", "ERROR")


def _is_missing(value):
    """Return True when *value* is None or a scalar NA/NaN marker.

    Containers (list, dict, tuple, arrays, ...) are never considered missing.
    Calling ``pd.isna`` on a list-like returns an array, which cannot be used
    safely in a boolean context, hence the explicit scalar check.
    """
    return value is None or (
        pd.api.types.is_scalar(value) and bool(pd.isna(value))
    )


def _utc_timestamp():
    """Return the current UTC time as a naive ISO-8601 string.

    Produces the same format as the deprecated ``datetime.utcnow().isoformat()``
    (no ``+00:00`` suffix).
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(tzinfo=None).isoformat()


def _load_json_field(value, default=None):
    """Decode a JSON-encoded cell, tolerating absent or empty cells.

    Args:
        value: A JSON string, an already-decoded object, or a missing marker
            (None / NaN).
        default: Returned when *value* is missing.

    Returns:
        The decoded object, *value* itself if it is not a string, or *default*.
    """
    if isinstance(value, (str, bytes, bytearray)):
        return json.loads(value)
    if _is_missing(value):
        return default
    return value


def _parse_url_field(value):
    """Normalise a URL field into a list.

    Args:
        value: A list (returned as-is), a string (either a Python-literal
            representation such as ``"['a', 'b']"`` or a plain URL), a missing
            marker (None / NaN), or any other single value.

    Returns:
        A list: the input list, the literal-evaluated list, a one-element list
        wrapping the value, or an empty list for missing input.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a bare URL): keep the raw string.
            return [value]
        return parsed if isinstance(parsed, list) else [parsed]
    if _is_missing(value):
        return []
    return [value]


class Paper:
    """Metadata, repository links, reproducibility results and logs of a paper."""

    def __init__(self, title="", venue="", year="", paper_id=None, pdf_url=None,
                 urls_manual=None, urls_auto=None, code_repro_manual=None,
                 code_repro_auto=None, logs=None, zip_path=None, verbose=0):
        """Build a paper record.

        Args:
            title, venue, year: Bibliographic metadata.
            paper_id: Identifier; computed from title and PDF URL when missing.
            pdf_url: URL of the PDF; when missing, `pdf_path` and `xml_path`
                are None.
            urls_manual, urls_auto: URL fields, normalised by
                `_parse_url_field`.
            code_repro_manual, code_repro_auto: Reproducibility dicts; an
                empty dict is used when missing.
            logs: Iterable of ``{"type": ..., "message": ...}`` dicts. Each
                entry is stamped with the time of loading.
            zip_path: Explicit archive path; otherwise derived from the paper
                id when a GitHub URL is known.
            verbose: Display mode used by `log` (0 silent, 1/2 print,
                3/4 streamlit; 2 and 4 also show tips).
        """
        # Metadata
        self.title = title
        self.venue = venue
        self.year = year
        self.pdf_url = pdf_url

        # Optional ground truth links (e.g., from curated metadata)
        self.urls_manual = _parse_url_field(urls_manual)
        self.urls_auto = _parse_url_field(urls_auto)

        self.paper_id = self._compute_id() if _is_missing(paper_id) else paper_id

        has_pdf = not _is_missing(pdf_url)
        self.pdf_path = f"data/papers/{self.paper_id}.pdf" if has_pdf else None
        self.xml_path = f"data/xml/{self.paper_id}.xml" if has_pdf else None
        self.zip_path = zip_path or (
            None if self.main_repo_url is None
            else f"data/test/{self.paper_id}.zip"
        )

        # Internal state
        self.logs = [
            {
                "timestamp": _utc_timestamp(),
                "level": LogType[entry["type"].upper()],
                "message": entry["message"],
            }
            for entry in ([] if _is_missing(logs) else logs)
        ]
        self.code_repro_manual = (
            dict() if _is_missing(code_repro_manual) else code_repro_manual
        )
        self.code_repro_auto = (
            dict() if _is_missing(code_repro_auto) else code_repro_auto
        )
        self.verbose = verbose

    def __repr__(self):
        """Return a short debug representation of the paper."""
        return (
            f"<Paper id={self.paper_id!r} title={self.title!r} "
            f"venue={self.venue!r} year={self.year!r}>"
        )

    @classmethod
    def from_url(cls, code_url, verbose):
        """Build a paper from a repository URL only, using a temporary zip path."""
        return cls(
            urls_manual=code_url,
            zip_path="_temp.zip",
            verbose=verbose,
        )

    @classmethod
    def from_raw(cls, row):
        """Build a paper from a raw metadata row.

        Supports both dicts and pandas Series (anything with ``.get``).
        """
        return cls(
            title=row.get("Title", ""),
            venue=row.get("Venue", ""),
            year=row.get("Year", ""),
            pdf_url=row.get("PDF"),
            urls_manual=row.get("Repository"),
            code_repro_manual={
                "public": row.get("Data Public"),
                "dependencies": row.get("Dependencies"),
                "training": row.get("Training code"),
                "evaluation": row.get("Evaluation code"),
                "weights": row.get("Pre-trained models"),
                "readme": row.get("README file"),
                "license": row.get("Licensing"),
            },
        )

    @classmethod
    def from_row(cls, row):
        """Build a paper from a row produced by `to_dict`.

        Supports both dicts and pandas Series. JSON-encoded columns that are
        absent or empty (None / NaN) fall back to the constructor defaults
        instead of raising.
        """
        return cls(
            title=row.get("title", ""),
            venue=row.get("venue", ""),
            year=row.get("year", ""),
            paper_id=row.get("paper_id"),
            pdf_url=row.get("pdf_url"),
            urls_manual=_load_json_field(row.get("urls_manual")),
            urls_auto=_load_json_field(row.get("urls_auto")),
            code_repro_manual=_load_json_field(
                row.get("code_reproducibility_manual")
            ),
            code_repro_auto=_load_json_field(
                row.get("code_reproducibility_auto")
            ),
            logs=_load_json_field(row.get("logs"), default=[]),
        )

    @property
    def main_repo_url(self):
        """First GitHub URL among manual then automatic URLs, or None."""
        candidates = (*self.urls_manual, *self.urls_auto)
        # Non-string entries (numbers, NaN, ...) cannot be GitHub links.
        return next(
            (u for u in candidates if isinstance(u, str) and "github.com" in u),
            None,
        )

    def _compute_id(self):
        """Return a 16-hex-character id hashed from the title and PDF URL."""
        paper_name = "" if _is_missing(self.title) else str(self.title)
        if not _is_missing(self.pdf_url):
            paper_name += str(self.pdf_url)
        return hashlib.sha256(paper_name.encode("utf-8")).hexdigest()[:16]

    def log(self, level, message):
        """Record a log entry and optionally display it.

        Args:
            level: Name of a `LogType` member (case-insensitive).
            message: Text of the entry.

        Raises:
            KeyError: If *level* is not a `LogType` member.
            ValueError: If display is enabled (``verbose != 0``) and *level*
                is not one of 'TIP', 'LOG', 'TITLE' or 'ERROR'. The entry has
                already been recorded at that point.
        """
        level_name = level.upper()
        self.logs.append({
            "timestamp": _utc_timestamp(),
            "level": LogType[level_name],  # "ERROR", "WARNING", "NOTE", etc.
            "message": message,
        })

        if self.verbose == 0:
            return

        if level_name not in _DISPLAYABLE_LEVELS:
            raise ValueError(
                "Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'."
            )

        # Modes 3 and 4 render through streamlit; every other non-zero mode
        # (including unrecognised values) falls back to print so that errors
        # are never lost to an unbound output function.
        show = st.write if self.verbose in (3, 4) else print
        # Only show tips in verbose mode 2 and 4
        show_tips = self.verbose in (2, 4)

        # Align line-break
        if message.startswith("\n"):
            show("\n")
            message = message.lstrip("\n")

        if level_name == "ERROR":
            show(f"**{message}**")
        elif show_tips:
            if level_name == "TITLE":
                show(f"\n#### {message}")
            elif level_name == "TIP":
                show(f"*{message}*")
            else:  # "LOG"
                show(f"{message}")

    def to_dict(self):
        """Serialise the paper to a flat dict; nested fields are JSON strings."""
        serialised_logs = [
            {
                "type": (
                    entry["level"].value
                    if isinstance(entry["level"], Enum)
                    else entry["level"]
                ),
                "message": entry["message"],
            }
            for entry in self.logs
        ]
        return {
            "title": self.title,
            "venue": self.venue,
            "year": self.year,
            "pdf_url": self.pdf_url,
            "paper_id": self.paper_id,
            "urls_auto": json.dumps(self.urls_auto),
            "urls_manual": json.dumps(self.urls_manual),
            "logs": json.dumps(serialised_logs),
            "code_reproducibility_manual": json.dumps(self.code_repro_manual),
            "code_reproducibility_auto": json.dumps(self.code_repro_auto),
        }