Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
# core/paper.py | |
import os | |
import uuid | |
import re | |
import fitz # PyMuPDF | |
import pdfplumber | |
import hashlib | |
import json | |
import pandas as pd | |
from config.constants import LogType | |
import ast | |
import streamlit as st | |
import datetime | |
from enum import Enum | |
def _parse_url_field(value): | |
if isinstance(value, list): | |
return value | |
if isinstance(value, str): | |
try: | |
parsed = ast.literal_eval(value) | |
return parsed if isinstance(parsed, list) else [parsed] | |
except Exception: | |
return [value] | |
if pd.isna(value): | |
return [] | |
return [value] | |
class Paper:
    """Metadata, repository links, reproducibility results and logs for one paper.

    Instances can be built directly or through the alternate constructors
    ``from_url``, ``from_raw`` and ``from_row``; ``to_dict`` produces the flat
    representation that ``from_row`` reads back.
    """

    # Levels that log() knows how to display.
    _DISPLAY_LEVELS = ("TITLE", "TIP", "LOG", "ERROR")

    def __init__(self, title="", venue="", year="", paper_id=None, pdf_url=None,
                 urls_manual=None, urls_auto=None, code_repro_manual=None,
                 code_repro_auto=None, logs=None, zip_path=None, verbose=0):
        """Create a paper record.

        Args:
            title, venue, year: bibliographic metadata, stored as given.
            paper_id: identifier; when missing (None/NaN) it is derived from
                the title and PDF URL by ``_compute_id``.
            pdf_url: URL of the PDF; when missing, ``pdf_path`` and
                ``xml_path`` are None.
            urls_manual, urls_auto: URL fields, normalized to lists by
                ``_parse_url_field``.
            code_repro_manual, code_repro_auto: reproducibility dicts; a
                missing value becomes an empty dict.
            logs: iterable of ``{"type": ..., "message": ...}`` dicts (the
                shape written by ``to_dict``); None means no logs.
            zip_path: explicit archive path; when falsy it is derived from
                ``paper_id`` if a GitHub URL is known, else None.
            verbose: 0 is silent; 1-2 display with ``print``; 3-4 display with
                ``st.write``; 2 and 4 additionally show TITLE/TIP/LOG entries.
        """
        # Metadata
        self.title = title
        self.venue = venue
        self.year = year
        self.pdf_url = pdf_url

        # Optional ground truth links (e.g., from curated metadata)
        self.urls_manual = _parse_url_field(urls_manual)
        self.urls_auto = _parse_url_field(urls_auto)

        self.paper_id = self._compute_id() if pd.isna(paper_id) else paper_id

        has_pdf = not pd.isna(pdf_url)
        self.pdf_path = f"data/papers/{self.paper_id}.pdf" if has_pdf else None
        self.xml_path = f"data/xml/{self.paper_id}.xml" if has_pdf else None

        # main_repo_url is a property, so this is None when no GitHub link exists.
        derived_zip = None
        if self.main_repo_url is not None:
            derived_zip = f"data/test/{self.paper_id}.zip"
        self.zip_path = zip_path or derived_zip

        # Internal state
        # to_dict() does not persist timestamps, so reloaded entries are
        # stamped with the time they are loaded.
        self.logs = [
            {
                "timestamp": self._utc_timestamp(),
                "level": LogType[entry["type"].upper()],
                "message": entry["message"],
            }
            for entry in ([] if logs is None else logs)
        ]
        self.code_repro_manual = dict() if pd.isna(code_repro_manual) else code_repro_manual
        self.code_repro_auto = dict() if pd.isna(code_repro_auto) else code_repro_auto
        self.verbose = verbose

    def __repr__(self):
        return f"<Paper: {self.title}>"

    @classmethod
    def from_url(cls, code_url, verbose):
        """Build a paper that only carries a code URL, archived at ``_temp.zip``."""
        return cls(
            urls_manual=code_url,
            zip_path="_temp.zip",
            verbose=verbose,
        )

    @classmethod
    def from_raw(cls, row):
        """Build a paper from a raw metadata row (capitalized column names).

        ``row`` only needs a ``.get`` method, so both dicts and pandas Series work.
        """
        return cls(
            title=row.get("Title", ""),
            venue=row.get("Venue", ""),
            year=row.get("Year", ""),
            pdf_url=row.get("PDF"),
            urls_manual=row.get("Repository"),
            code_repro_manual={
                "public": row.get("Data Public"),
                "dependencies": row.get("Dependencies"),
                "training": row.get("Training code"),
                "evaluation": row.get("Evaluation code"),
                "weights": row.get("Pre-trained models"),
                "readme": row.get("README file"),
                "license": row.get("Licensing"),
            },
        )

    @classmethod
    def from_row(cls, row):
        """Build a paper from a row in the format produced by ``to_dict``.

        ``row`` only needs a ``.get`` method, so both dicts and pandas Series
        work. JSON columns that are missing or NaN fall back to their empty
        defaults instead of failing inside ``json.loads``.
        """
        return cls(
            title=row.get("title", ""),
            venue=row.get("venue", ""),
            year=row.get("year", ""),
            paper_id=row.get("paper_id"),
            pdf_url=row.get("pdf_url"),
            urls_manual=cls._load_json_field(row.get("urls_manual")),
            urls_auto=cls._load_json_field(row.get("urls_auto")),
            code_repro_manual=cls._load_json_field(row.get("code_reproducibility_manual")),
            code_repro_auto=cls._load_json_field(row.get("code_reproducibility_auto")),
            logs=cls._load_json_field(row.get("logs", "[]"), default=[]),
        )

    @property
    def main_repo_url(self):
        """First URL containing ``github.com`` (manual links first), or None."""
        for url in (*self.urls_manual, *self.urls_auto):
            # Skip non-string entries; a substring test on them would raise TypeError.
            if isinstance(url, str) and "github.com" in url:
                return url
        return None

    def _compute_id(self):
        """Return the first 16 hex chars of SHA-256 over title (+ PDF URL if set)."""
        name = self.title
        if not pd.isna(self.pdf_url):
            name += self.pdf_url
        return hashlib.sha256(name.encode("utf-8")).hexdigest()[:16]

    @staticmethod
    def _utc_timestamp():
        """Return the current UTC time as a naive ISO-8601 string.

        Same format as the deprecated ``datetime.utcnow().isoformat()``.
        """
        now = datetime.datetime.now(datetime.timezone.utc)
        return now.replace(tzinfo=None).isoformat()

    @staticmethod
    def _load_json_field(raw, default=None):
        """Decode a JSON text cell; pass through lists/dicts; else return *default*."""
        if isinstance(raw, (str, bytes, bytearray)):
            return json.loads(raw)
        if isinstance(raw, (list, dict)):
            return raw
        return default

    def log(self, level, message):
        """Record a log entry and, depending on ``self.verbose``, display it.

        Args:
            level: name of a ``LogType`` member, matched case-insensitively.
            message: text to record; leading newlines are emitted as a
                separate line break before the message is displayed.

        Raises:
            KeyError: if ``level`` is not a ``LogType`` member name.
            ValueError: if display is enabled (``verbose != 0``) and ``level``
                is not one of TIP, LOG, TITLE or ERROR.
        """
        level_name = level.upper()
        self.logs.append({
            "timestamp": self._utc_timestamp(),
            "level": LogType[level_name],  # "ERROR", "WARNING", "NOTE", etc.
            "message": message,
        })

        if self.verbose == 0:
            return

        # Validate before writing anything so a bad level emits no partial output.
        if level_name not in self._DISPLAY_LEVELS:
            raise ValueError("Invalid log type. Use 'TIP', 'LOG', 'TITLE' or 'ERROR'.")

        show_tips = self.verbose in (2, 4)
        # Modes 3 and 4 render through Streamlit; every other non-zero mode
        # prints, so ``show`` is always bound.
        show = st.write if self.verbose in (3, 4) else print

        # Align line-break
        if message.startswith("\n"):
            show("\n")
            message = message.lstrip("\n")

        if level_name == "ERROR":
            show(f"**{message}**")
        elif show_tips:
            # Only show tips in verbose mode 2 and 4
            if level_name == "TITLE":
                show(f"\n#### {message}")
            elif level_name == "TIP":
                show(f"*{message}*")
            else:
                show(f"{message}")

    def to_dict(self):
        """Return a flat dict of the paper with list/dict fields JSON-encoded.

        Log timestamps are not included; only each entry's type and message.
        """
        serialized_logs = [
            {
                "type": entry["level"].value if isinstance(entry["level"], Enum) else entry["level"],
                "message": entry["message"],
            }
            for entry in self.logs
        ]
        return {
            "title": self.title,
            "venue": self.venue,
            "year": self.year,
            "pdf_url": self.pdf_url,
            "paper_id": self.paper_id,
            "urls_auto": json.dumps(self.urls_auto),
            "urls_manual": json.dumps(self.urls_manual),
            "logs": json.dumps(serialized_logs),
            "code_reproducibility_manual": json.dumps(self.code_repro_manual),
            "code_reproducibility_auto": json.dumps(self.code_repro_auto),
        }