# ScientryBackend / main.py
# Commit f50b29d by raannakasturi (file size 4.81 kB):
#   Refactor summarization and email sending logic; improve error handling
#   and environment variable checks
import json
import os
import time
import dotenv
import html
from summarize_paper import summarize_paper
from fetch_data import fetch_paper_data_with_category
from post_blog import post_blog
from send_mail import send_email
# Load environment configuration via python-dotenv before any variable is read
# (presumably from a local .env file, if one is present).
dotenv.load_dotenv()
# Secret that callers of post_blogpost()/test() must present; it is also
# forwarded to summarize_paper() and post_blog(). os.getenv returns None when
# the ACCESS_KEY environment variable is not set.
ACCESS_KEY = os.getenv("ACCESS_KEY")
def fix_text(text: str) -> str:
    """HTML-escape *text* and repair words that look like mis-decoded UTF-8.

    The text is first HTML-escaped (``&``, ``<``, ``>`` and quotes become
    entities).  It is then split on whitespace and each word is re-encoded as
    Latin-1 and decoded as UTF-8; words for which that round trip fails are
    kept unchanged.

    Args:
        text: The raw text to clean up.

    Returns:
        The escaped, repaired words, each followed by a single space.  Runs of
        whitespace in the input therefore collapse to one space and a
        non-empty result ends with a trailing space; callers strip it.
        Empty or whitespace-only input yields ``""``.
    """
    # Replace the known garbled-ellipsis sequence before escaping.
    escaped = html.escape(text.replace("â¦", "..., "))

    repaired_words = []
    for word in escaped.split():
        try:
            repaired_words.append(word.encode("latin1").decode("utf-8"))
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not representable in Latin-1, or not valid UTF-8 once
            # re-encoded: the word was not mis-decoded, so keep it as is.
            repaired_words.append(word)

    # Keep the historical "word + space" layout, including the trailing space.
    return "".join(f"{word} " for word in repaired_words)
def _summarize_with_retry(pdf_url, paper_id, max_retries, retry_delay):
    """Call summarize_paper until it yields both a summary and a mindmap.

    Args:
        pdf_url: URL of the paper's PDF, passed through to summarize_paper.
        paper_id: Identifier of the paper, used for the call and for logging.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        A ``(summary, mindmap)`` tuple, or ``(None, None)`` when every attempt
        raised or returned an incomplete result.
    """
    for attempt in range(1, max_retries + 1):
        try:
            summary, mindmap = summarize_paper(pdf_url, paper_id, ACCESS_KEY)
        except Exception as e:
            # Best-effort boundary: one failing paper must not abort the run.
            print(f"Error summarizing paper {paper_id}: {e}")
        else:
            if summary and mindmap:
                return summary, mindmap
        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            print(f"Retrying paper {paper_id} in {retry_delay / 60:g} minutes")
            time.sleep(retry_delay)
    return None, None


def paper_data(
    paper_data_json: str,
    wait_time: int = 5,
    max_retries: int = 3,
    retry_delay: float = 3 * 60,
) -> str:
    """Summarize and post every paper described in *paper_data_json*.

    Args:
        paper_data_json: JSON object mapping category -> paper id -> details,
            where details carries ``doi``, ``pdf_url``, ``title`` and
            ``citation``.
        wait_time: Forwarded unchanged to post_blog.
        max_retries: Maximum summarization attempts per paper.
        retry_delay: Seconds to wait between summarization attempts.

    Returns:
        A JSON string ``{"status": "success", "data": {category: {paper_id:
        {...}}}}`` with one entry per paper that was summarized and handed to
        post_blog.  Papers with missing details, failed summarization or a
        failing post are logged and omitted.

    Raises:
        json.JSONDecodeError: If *paper_data_json* is not valid JSON.
    """
    result_data = {"status": "success", "data": {}}
    papers_by_category = json.loads(paper_data_json)

    for category, papers in papers_by_category.items():
        print(f"Processing category: {category}")
        category_results = {}
        result_data["data"][category] = category_results

        for paper_id, details in papers.items():
            doi = details.get("doi")
            pdf_url = details.get("pdf_url")
            title = details.get("title")
            citation = details.get("citation")
            if not all([paper_id, doi, pdf_url, title, citation]):
                print(f"Skipping paper with ID: {paper_id} (missing details)")
                continue

            summary, mindmap = _summarize_with_retry(
                pdf_url, paper_id, max_retries, retry_delay
            )
            if not summary or not mindmap:
                print(f"Failed to summarize paper {paper_id} after {max_retries} attempts")
                continue

            try:
                # fix_text already HTML-escapes its input, so it must not be
                # wrapped in another html.escape: escaping repeatedly turns
                # "&" into "&amp;amp;..." in the stored and posted text.
                fixed_title = fix_text(title).strip()
                fixed_citation = fix_text(citation).strip()
                fixed_summary = html.escape(str(summary).strip())
                fixed_mindmap = html.escape(str(mindmap).strip())
                post_status = post_blog(
                    doi,
                    fixed_title,
                    category,
                    fixed_summary,
                    fixed_mindmap,
                    fixed_citation,
                    ACCESS_KEY,
                    wait_time,
                )
            except Exception as e:
                # Best-effort boundary: log and move on to the next paper.
                print(f"Error posting blog '{title}': {e}")
                continue

            category_results[paper_id] = {
                "id": paper_id,
                "doi": doi,
                "title": fixed_title,
                "category": category,
                "posted": post_status,
                "citation": fixed_citation,
                "summary": fixed_summary,
                "mindmap": fixed_mindmap,
            }

    return json.dumps(result_data, indent=4, ensure_ascii=False)
def post_blogpost(uaccess_key: str, wait_time: int = 5) -> "str | bool":
    """Fetch, process and post the papers, then e-mail the resulting report.

    Args:
        uaccess_key: Access key supplied by the caller; must equal ACCESS_KEY.
        wait_time: Forwarded to paper_data.

    Returns:
        ``False`` when the key is rejected; otherwise the JSON string returned
        by paper_data.  A failure to send the e-mail is logged and does not
        change the return value.
    """
    import hmac  # local import: only needed for the key comparison below

    # Reject when no key is configured (otherwise a missing environment
    # variable would let ``None`` through) and compare in constant time.
    if (
        not ACCESS_KEY
        or not isinstance(uaccess_key, str)
        or not hmac.compare_digest(
            uaccess_key.encode("utf-8"), ACCESS_KEY.encode("utf-8")
        )
    ):
        return False

    data = fetch_paper_data_with_category(uaccess_key)
    processed_data = paper_data(data, wait_time)
    try:
        send_email(processed_data)
    except Exception as e:
        # Mail delivery is best-effort; the processed data is still returned.
        print(f"\n-------------------------------------------------------\nError sending mail: {e}\n-------------------------------------------------------\n")
    else:
        print("\n-------------------------------------------------------\nMail Sent\n-------------------------------------------------------\n")
    finally:
        print("\n-------------------------------------------------------\nProcess Completed\n-------------------------------------------------------\n")
    return processed_data
def test(uaccess_key: str) -> "str | bool":
    """Run the processing pipeline on one built-in sample paper.

    Args:
        uaccess_key: Access key supplied by the caller; must equal ACCESS_KEY.

    Returns:
        ``False`` when the key is rejected (including when ACCESS_KEY is not
        configured); otherwise the JSON string returned by paper_data for the
        sample record.
    """
    import hmac  # local import: only needed for the key comparison below

    # Same rule as post_blogpost: an unset ACCESS_KEY never authorizes, and
    # the comparison is constant-time.
    if (
        not ACCESS_KEY
        or not isinstance(uaccess_key, str)
        or not hmac.compare_digest(
            uaccess_key.encode("utf-8"), ACCESS_KEY.encode("utf-8")
        )
    ):
        return False

    test_data = {
        "Economics": {
            "2501.00578": {
                "paper_id": "2501.00578",
                "doi": "https://doi.org/10.1002/alz.14328",
                "title": "Bound-State Beta Decay of $\\mathbf{\\mathrm{^{205}{Tl}^{81+}}}$ Ions and the LOREX Project",
                "category": "Economics",
                "pdf_url": "https://arxiv.org/pdf/2501.00578",
                "citation": "Miller, A. D. (2025). The limits of tolerance (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00578",
            }
        }
    }
    return paper_data(json.dumps(test_data, ensure_ascii=False, indent=4))
if __name__ == "__main__":
    # Script entry point: run the sample pipeline with the configured key and
    # print whatever test() returns.
    print(test(ACCESS_KEY))