raannakasturi committed on
Commit
98434dd
·
1 Parent(s): 5bb5491

Refactor paper summarization to improve text handling and remove unused parameters

Browse files
Files changed (2) hide show
  1. main.py +16 -9
  2. summarize_paper.py +7 -11
main.py CHANGED
@@ -10,6 +10,16 @@ from send_mail import send_email
10
  dotenv.load_dotenv()
11
  access_key = os.getenv("ACCESS_KEY")
12
 
 
 
 
 
 
 
 
 
 
 
13
  def paper_data(paper_data, wait_time=5):
14
  data = {"status": "success"}
15
  data['data'] = {}
@@ -21,23 +31,20 @@ def paper_data(paper_data, wait_time=5):
21
  doi = details.get("doi")
22
  pdf_url = details.get("pdf_url")
23
  title = details.get("title")
 
24
  citation = details.get("citation")
25
  if not all([paper_id, doi, pdf_url, title, citation]):
26
  print(f"Skipping paper with ID: {paper_id} (missing details)")
27
  continue
28
- fixed_title, summary, mindmap, fixed_citation = summarize_paper(title, pdf_url, paper_id, citation, access_key)
29
- if not fixed_title:
30
- title = title
31
- else:
32
- title = fixed_title
33
- if not fixed_citation:
34
- citation = citation
35
- else:
36
- citation = fixed_citation
37
  if not summary or not mindmap:
38
  print(f"Skipping paper with ID: {paper_id} (Summary/Mindmap not found)")
39
  continue
40
  try:
 
 
 
 
41
  title = html.escape(str(title).strip())
42
  citation = html.escape(str(citation).strip())
43
  status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
 
10
  dotenv.load_dotenv()
11
  access_key = os.getenv("ACCESS_KEY")
12
 
13
def fix_text(text):
    """HTML-escape *text* and repair words that look like mis-decoded UTF-8.

    The text is HTML-escaped, split on whitespace, and each word is
    re-encoded as Latin-1 and decoded as UTF-8. Words for which that
    round trip fails are kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        The processed words, each followed by a single space (runs of
        whitespace collapse to one space and a non-empty result ends with
        a space), with every backslash doubled. Returns "" when *text*
        contains no words.

    Raises:
        UnicodeEncodeError: If *text* cannot be encoded as UTF-8
            (for example, it contains lone surrogates).
    """
    # For any valid str the UTF-8 round trip returns the same string; it is
    # kept so that un-encodable input still fails here, as it always has.
    validated = text.encode('utf-8').decode('utf-8')
    # NOTE(review): "â¦" is presumably the mojibake form of an ellipsis -- confirm.
    escaped = html.escape(validated.replace("â¦", "..., "))

    pieces = []
    for word in escaped.split():
        try:
            # Undo UTF-8 bytes that were wrongly decoded as Latin-1.
            word = word.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not Latin-1 encodable, or not valid UTF-8 afterwards: the word
            # is not mojibake of this kind, so keep it as it is.
            pass
        pieces.append(word + " ")

    return "".join(pieces).replace('\\', '\\\\')
22
+
23
  def paper_data(paper_data, wait_time=5):
24
  data = {"status": "success"}
25
  data['data'] = {}
 
31
  doi = details.get("doi")
32
  pdf_url = details.get("pdf_url")
33
  title = details.get("title")
34
+ title = html.escape(title.encode('unicode-escape').decode().replace('\\\\', '\\'))
35
  citation = details.get("citation")
36
  if not all([paper_id, doi, pdf_url, title, citation]):
37
  print(f"Skipping paper with ID: {paper_id} (missing details)")
38
  continue
39
+ summary, mindmap = summarize_paper(pdf_url, paper_id, access_key)
 
 
 
 
 
 
 
 
40
  if not summary or not mindmap:
41
  print(f"Skipping paper with ID: {paper_id} (Summary/Mindmap not found)")
42
  continue
43
  try:
44
+ title = fix_text(title)
45
+ summary = fix_text(summary)
46
+ mindmap = fix_text(mindmap)
47
+ citation = fix_text(citation)
48
  title = html.escape(str(title).strip())
49
  citation = html.escape(str(citation).strip())
50
  status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
summarize_paper.py CHANGED
@@ -1,32 +1,28 @@
 
1
  import json
2
  from gradio_client import Client
3
 
4
- def summarize_paper(paper_title, pdf_url, paper_id, paper_citation, access_key):
5
  mindmap = None
6
  summary = None
7
- title = None
8
- citation = None
9
  try:
10
- summarizer_client = Client("raannakasturi/ReXploreAPI")
 
 
 
11
  result = summarizer_client.predict(
12
  url=pdf_url,
13
- title=paper_title,
14
  id=paper_id,
15
- citation=paper_citation,
16
  access_key=access_key,
17
  api_name="/rexplore_summarizer"
18
  )
19
  if result:
20
  data = json.loads(result[0])
21
  print
22
- if data['title']:
23
- title = data['title']
24
- if data['citation']:
25
- citation = data['citation']
26
  if data["mindmap_status"] == "success":
27
  mindmap = data["mindmap"]
28
  if data["summary_status"] == "success":
29
  summary = data["summary"]
30
  except Exception as e:
31
  print(f"Error summarizing paper: {e}")
32
- return title, summary, mindmap, citation
 
1
+ import os
2
  import json
3
  from gradio_client import Client
4
 
5
def summarize_paper(pdf_url, paper_id, access_key):
    """Request a summary and a mindmap for a paper from the ReXploreAPI Space.

    Args:
        pdf_url: Sent to the endpoint as ``url``.
        paper_id: Sent to the endpoint as ``id``.
        access_key: Sent to the endpoint as ``access_key``.

    Returns:
        A ``(summary, mindmap)`` tuple. Each element is ``None`` unless the
        JSON response reports ``"success"`` for the matching
        ``summary_status`` / ``mindmap_status`` field. Any exception raised
        while calling the endpoint or parsing its response is printed and
        not propagated.
    """
    summary = None
    mindmap = None
    try:
        summarizer_client = Client(
            "raannakasturi/ReXploreAPI",
            hf_token=os.environ.get("HF_API_TOKEN"),
        )
        result = summarizer_client.predict(
            url=pdf_url,
            id=paper_id,
            access_key=access_key,
            api_name="/rexplore_summarizer",
        )
        if result:
            # The first element of the result is parsed as a JSON document.
            data = json.loads(result[0])
            # .get() keeps a missing status key from raising KeyError and
            # discarding a field that did come back successfully.
            if data.get("mindmap_status") == "success":
                mindmap = data.get("mindmap")
            if data.get("summary_status") == "success":
                summary = data.get("summary")
    except Exception as e:
        # Best-effort: report the failure and return whatever was collected.
        print(f"Error summarizing paper: {e}")
    return summary, mindmap