# reproduce / core /paper.py
# Attila Simkó
# new code
# 8dc0a07
# core/paper.py
import os
import uuid
import re
import fitz # PyMuPDF
import pdfplumber
import hashlib
import json
import pandas as pd
from config.constants import LogType
import ast
import streamlit as st
import datetime
from enum import Enum
def _parse_url_field(value):
    """Normalise a URL field into a list of entries.

    Accepted inputs:
        * a list -- returned unchanged;
        * a tuple or set -- converted to a list;
        * a string holding a Python literal (e.g. ``"['a', 'b']"``) -- the
          literal is evaluated; sequences are flattened into a list, any other
          non-``None`` value is wrapped in a one-element list;
        * any other string -- treated as one plain URL (whitespace stripped);
        * a blank string, the literal ``"None"``, ``None`` or NaN -- ``[]``;
        * anything else -- wrapped in a one-element list.

    Returns:
        list: the normalised entries.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, (tuple, set, frozenset)):
        return list(value)

    if isinstance(value, str):
        text = value.strip()
        if not text:
            # A blank cell carries no URL; do not produce [""].
            return []
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (the usual case for a bare URL).
            return [text]
        if parsed is None:
            return []
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        return [parsed]

    # Only ask pd.isna about scalars: for array-likes it returns an array
    # whose truth value is ambiguous.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    return [value]
class Paper:
    """Metadata, repository links and processing log for a single paper.

    Attributes set by ``__init__``:
        title, venue, year, pdf_url: metadata, stored as given.
        urls_manual, urls_auto: lists produced by ``_parse_url_field``.
        paper_id: the supplied id, or a hash of title + pdf_url when missing.
        pdf_path, xml_path: paths under ``data/`` derived from ``paper_id``;
            ``None`` when there is no ``pdf_url``.
        zip_path: the supplied path, otherwise a path derived from
            ``paper_id`` when a GitHub link is known, otherwise ``None``.
        logs: list of ``{"timestamp", "level", "message"}`` dicts.
        code_repro_manual, code_repro_auto: dicts (empty when not supplied).
        verbose: 0 = silent; 1/2 = ``print``; 3/4 = ``st.write``. Modes 2 and
            4 additionally display TITLE/TIP/LOG messages.
    """

    # Levels that ``log`` knows how to display.
    _DISPLAY_LEVELS = ("TITLE", "TIP", "LOG", "ERROR")

    def __init__(
        self,
        title="",
        venue="",
        year="",
        paper_id=None,
        pdf_url=None,
        urls_manual=None,
        urls_auto=None,
        code_repro_manual=None,
        code_repro_auto=None,
        logs=None,
        zip_path=None,
        verbose=0,
    ):
        """Build a paper record.

        Args:
            title, venue, year: free-form metadata.
            paper_id: existing identifier; computed from title and pdf_url
                when ``None``/NaN.
            pdf_url: location of the PDF, or ``None``/NaN when unknown.
            urls_manual, urls_auto: URL fields in any form accepted by
                ``_parse_url_field``.
            code_repro_manual, code_repro_auto: reproducibility dicts;
                ``None``/NaN becomes an empty dict.
            logs: iterable of ``{"type": ..., "message": ...}`` dicts, or
                ``None`` for no logs.
            zip_path: explicit archive path overriding the derived one.
            verbose: display mode used by ``log``.

        Raises:
            KeyError: if a log entry's type is not a ``LogType`` member name.
        """
        # Metadata
        self.title = title
        self.venue = venue
        self.year = year
        self.pdf_url = pdf_url

        # Optional ground truth links (e.g., from curated metadata)
        self.urls_manual = _parse_url_field(urls_manual)
        self.urls_auto = _parse_url_field(urls_auto)

        self.paper_id = self._compute_id() if pd.isna(paper_id) else paper_id

        has_pdf = not pd.isna(pdf_url)
        self.pdf_path = f"data/papers/{self.paper_id}.pdf" if has_pdf else None
        self.xml_path = f"data/xml/{self.paper_id}.xml" if has_pdf else None
        if zip_path:
            self.zip_path = zip_path
        elif self.main_repo_url is None:
            self.zip_path = None
        else:
            self.zip_path = f"data/test/{self.paper_id}.zip"

        # Internal state
        # NOTE: serialised logs carry no timestamp (see to_dict), so restored
        # entries are stamped with the time of loading.
        self.logs = [
            {
                "timestamp": self._utc_timestamp(),
                "level": LogType[entry["type"].upper()],
                "message": entry["message"],
            }
            for entry in ([] if logs is None else logs)
        ]
        self.code_repro_manual = dict() if pd.isna(code_repro_manual) else code_repro_manual
        self.code_repro_auto = dict() if pd.isna(code_repro_auto) else code_repro_auto
        self.verbose = verbose

    def __repr__(self):
        return f"<Paper: {self.title}>"

    @staticmethod
    def _utc_timestamp():
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same text format as ``datetime.utcnow().isoformat()``
        without calling the deprecated ``utcnow``.
        """
        now = datetime.datetime.now(datetime.timezone.utc)
        return now.replace(tzinfo=None).isoformat()

    @staticmethod
    def _load_json_field(raw):
        """Decode one JSON-encoded cell.

        Already-decoded lists/dicts are passed through and missing cells
        (``None``/NaN) yield ``None``; everything else goes to ``json.loads``.
        """
        if isinstance(raw, (list, dict)):
            return raw
        if not isinstance(raw, (str, bytes, bytearray)) and pd.isna(raw):
            return None
        return json.loads(raw)

    @classmethod
    def from_url(cls, code_url, verbose):
        """Build a throw-away paper from a single repository URL.

        The archive path is fixed to ``"_temp.zip"``.
        """
        return cls(
            urls_manual=code_url,
            zip_path="_temp.zip",
            verbose=verbose,
        )

    @classmethod
    def from_raw(cls, row):
        """Build a paper from a raw metadata row (capitalised column names)."""
        # Supports both dicts and pandas Series
        return cls(
            title=row.get("Title", ""),
            venue=row.get("Venue", ""),
            year=row.get("Year", ""),
            pdf_url=row.get('PDF'),
            urls_manual=row.get("Repository"),
            code_repro_manual={
                "public": row.get("Data Public"),
                "dependencies": row.get("Dependencies"),
                "training": row.get("Training code"),
                "evaluation": row.get("Evaluation code"),
                "weights": row.get("Pre-trained models"),
                "readme": row.get("README file"),
                "license": row.get("Licensing"),
            },
        )

    @classmethod
    def from_row(cls, row):
        """Build a paper from a row in the format produced by ``to_dict``.

        Missing (``None``/NaN) JSON cells fall back to the constructor
        defaults instead of failing inside ``json.loads``.
        """
        # Supports both dicts and pandas Series
        return cls(
            title=row.get("title", ""),
            venue=row.get("venue", ""),
            year=row.get("year", ""),
            paper_id=row.get('paper_id'),
            pdf_url=row.get('pdf_url'),
            urls_manual=cls._load_json_field(row.get("urls_manual")),
            urls_auto=cls._load_json_field(row.get("urls_auto")),
            code_repro_manual=cls._load_json_field(row.get("code_reproducibility_manual")),
            code_repro_auto=cls._load_json_field(row.get("code_reproducibility_auto")),
            logs=cls._load_json_field(row.get("logs", "[]")),
        )

    @property
    def main_repo_url(self):
        """First URL containing ``github.com`` (manual links first), or ``None``."""
        for url in (*self.urls_manual, *self.urls_auto):
            # Skip non-string entries (None, NaN, numbers) rather than
            # failing on the substring test.
            if isinstance(url, str) and "github.com" in url:
                return url
        return None

    def _compute_id(self):
        """Return a 16-hex-character id: SHA-256 of title followed by pdf_url.

        A missing (``None``/NaN) title or pdf_url contributes nothing to the
        hashed text.
        """
        paper_name = "" if pd.isna(self.title) else str(self.title)
        if not pd.isna(self.pdf_url):
            paper_name += str(self.pdf_url)
        return hashlib.sha256(paper_name.encode("utf-8")).hexdigest()[:16]

    def log(self, level, message):
        """Record a log entry and, depending on ``self.verbose``, display it.

        Args:
            level: ``LogType`` member name, case-insensitive.
            message: text to record. Leading newlines are kept in the stored
                entry but emitted as a separate blank line when displayed.

        Raises:
            KeyError: if ``level`` is not a ``LogType`` member name.
            ValueError: if output is enabled (``verbose != 0``) and ``level``
                is not one of TITLE, TIP, LOG or ERROR.
        """
        # Normalise once so the enum lookup and the display checks agree.
        level = level.upper()
        self.logs.append({
            "timestamp": self._utc_timestamp(),
            "level": LogType[level],  # "ERROR", "WARNING", "NOTE", etc.
            "message": message,
        })
        if self.verbose == 0:
            return
        if level not in self._DISPLAY_LEVELS:
            raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")

        # Only show tips in verbose mode 2 and 4
        show_tips = self.verbose in (2, 4)
        # Modes 3/4 render through streamlit; every other non-zero value
        # falls back to print so that `show` is always bound.
        show = st.write if self.verbose in (3, 4) else print

        # Align line-break
        if message.startswith("\n"):
            show("\n")
            message = message.lstrip('\n')

        if level == "ERROR":
            show(f"**{message}**")
        elif show_tips:
            if level == "TITLE":
                show(f"\n#### {message}")
            elif level == "TIP":
                show(f"*{message}*")
            else:  # "LOG"
                show(f"{message}")

    def to_dict(self):
        """Serialise the paper to a flat dict with JSON-encoded fields.

        Log entries are reduced to ``{"type", "message"}``; timestamps are
        not included.
        """
        # NOTE(review): from_row looks the stored "type" up by LogType member
        # *name*, while the enum *value* is written here -- confirm the two
        # agree in config.constants.
        serialised_logs = [
            {
                "type": entry["level"].value if isinstance(entry["level"], Enum) else entry["level"],
                "message": entry["message"],
            }
            for entry in self.logs
        ]
        return {
            "title": self.title,
            "venue": self.venue,
            "year": self.year,
            "pdf_url": self.pdf_url,
            "paper_id": self.paper_id,
            "urls_auto": json.dumps(self.urls_auto),
            "urls_manual": json.dumps(self.urls_manual),
            "logs": json.dumps(serialised_logs),
            "code_reproducibility_manual": json.dumps(self.code_repro_manual),
            "code_reproducibility_auto": json.dumps(self.code_repro_auto),
        }