Spaces:
Running
Running
Commit
·
98434dd
1
Parent(s):
5bb5491
Refactor paper summarization to improve text handling and remove unused parameters
Browse files
- main.py +16 -9
- summarize_paper.py +7 -11
main.py
CHANGED
@@ -10,6 +10,16 @@ from send_mail import send_email
|
|
10 |
dotenv.load_dotenv()
|
11 |
access_key = os.getenv("ACCESS_KEY")
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def paper_data(paper_data, wait_time=5):
|
14 |
data = {"status": "success"}
|
15 |
data['data'] = {}
|
@@ -21,23 +31,20 @@ def paper_data(paper_data, wait_time=5):
|
|
21 |
doi = details.get("doi")
|
22 |
pdf_url = details.get("pdf_url")
|
23 |
title = details.get("title")
|
|
|
24 |
citation = details.get("citation")
|
25 |
if not all([paper_id, doi, pdf_url, title, citation]):
|
26 |
print(f"Skipping paper with ID: {paper_id} (missing details)")
|
27 |
continue
|
28 |
-
|
29 |
-
if not fixed_title:
|
30 |
-
title = title
|
31 |
-
else:
|
32 |
-
title = fixed_title
|
33 |
-
if not fixed_citation:
|
34 |
-
citation = citation
|
35 |
-
else:
|
36 |
-
citation = fixed_citation
|
37 |
if not summary or not mindmap:
|
38 |
print(f"Skipping paper with ID: {paper_id} (Summary/Mindmap not found)")
|
39 |
continue
|
40 |
try:
|
|
|
|
|
|
|
|
|
41 |
title = html.escape(str(title).strip())
|
42 |
citation = html.escape(str(citation).strip())
|
43 |
status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
|
|
|
10 |
dotenv.load_dotenv()
|
11 |
access_key = os.getenv("ACCESS_KEY")
|
12 |
|
13 |
+
def fix_text(text):
    """Escape *text* for HTML and repair mis-decoded characters word by word.

    Steps, in order:

    1. Replace the garbled ellipsis sequence with ``"..., "``.
    2. HTML-escape the result with ``html.escape``.
    3. Split on whitespace and, for each word, try to undo a
       UTF-8-read-as-Latin-1 mis-decoding
       (``word.encode('latin1').decode('utf-8')``); a word that cannot be
       round-tripped is kept unchanged.
    4. Double every backslash in the rebuilt string.

    Because of the split, every run of whitespace (newlines included)
    collapses to a single space, and each word -- the last one too -- is
    followed by one space, so a non-empty result ends with a trailing space.

    Args:
        text: The string to clean.

    Returns:
        The escaped and repaired string, or ``""`` when *text* contains no
        words.
    """
    escaped = html.escape(text.replace("â¦", "..., "))
    repaired = []
    for word in escaped.split():
        try:
            word = word.encode('latin1').decode('utf-8')
        except UnicodeError:
            # Not mojibake (either not Latin-1 encodable or not valid UTF-8
            # afterwards): keep the word as it is. Only Unicode errors are
            # expected here, so nothing broader is swallowed.
            pass
        repaired.append(word + " ")
    return "".join(repaired).replace('\\', '\\\\')
|
22 |
+
|
23 |
def paper_data(paper_data, wait_time=5):
|
24 |
data = {"status": "success"}
|
25 |
data['data'] = {}
|
|
|
31 |
doi = details.get("doi")
|
32 |
pdf_url = details.get("pdf_url")
|
33 |
title = details.get("title")
|
34 |
+
title = html.escape(title.encode('unicode-escape').decode().replace('\\\\', '\\'))
|
35 |
citation = details.get("citation")
|
36 |
if not all([paper_id, doi, pdf_url, title, citation]):
|
37 |
print(f"Skipping paper with ID: {paper_id} (missing details)")
|
38 |
continue
|
39 |
+
summary, mindmap = summarize_paper(pdf_url, paper_id, access_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
if not summary or not mindmap:
|
41 |
print(f"Skipping paper with ID: {paper_id} (Summary/Mindmap not found)")
|
42 |
continue
|
43 |
try:
|
44 |
+
title = fix_text(title)
|
45 |
+
summary = fix_text(summary)
|
46 |
+
mindmap = fix_text(mindmap)
|
47 |
+
citation = fix_text(citation)
|
48 |
title = html.escape(str(title).strip())
|
49 |
citation = html.escape(str(citation).strip())
|
50 |
status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
|
summarize_paper.py
CHANGED
@@ -1,32 +1,28 @@
|
|
|
|
1 |
import json
|
2 |
from gradio_client import Client
|
3 |
|
4 |
-
def summarize_paper(
|
5 |
mindmap = None
|
6 |
summary = None
|
7 |
-
title = None
|
8 |
-
citation = None
|
9 |
try:
|
10 |
-
summarizer_client = Client(
|
|
|
|
|
|
|
11 |
result = summarizer_client.predict(
|
12 |
url=pdf_url,
|
13 |
-
title=paper_title,
|
14 |
id=paper_id,
|
15 |
-
citation=paper_citation,
|
16 |
access_key=access_key,
|
17 |
api_name="/rexplore_summarizer"
|
18 |
)
|
19 |
if result:
|
20 |
data = json.loads(result[0])
|
21 |
print
|
22 |
-
if data['title']:
|
23 |
-
title = data['title']
|
24 |
-
if data['citation']:
|
25 |
-
citation = data['citation']
|
26 |
if data["mindmap_status"] == "success":
|
27 |
mindmap = data["mindmap"]
|
28 |
if data["summary_status"] == "success":
|
29 |
summary = data["summary"]
|
30 |
except Exception as e:
|
31 |
print(f"Error summarizing paper: {e}")
|
32 |
-
return
|
|
|
1 |
+
import os
|
2 |
import json
|
3 |
from gradio_client import Client
|
4 |
|
5 |
+
def summarize_paper(pdf_url, paper_id, access_key):
    """Request a summary and a mindmap for a paper from the ReXplore API.

    Calls the ``/rexplore_summarizer`` endpoint of the
    ``raannakasturi/ReXploreAPI`` Gradio Space, passing the value of the
    ``HF_API_TOKEN`` environment variable as ``hf_token``, and parses the
    first element of the result as JSON.

    Args:
        pdf_url: URL of the paper's PDF, sent as ``url``.
        paper_id: Identifier of the paper, sent as ``id``.
        access_key: Access key forwarded to the endpoint.

    Returns:
        A ``(summary, mindmap)`` tuple. Each element stays ``None`` unless
        the response reports ``"success"`` in the matching ``*_status``
        field. Any exception is printed and whatever was collected up to
        that point is returned.
    """
    mindmap = None
    summary = None
    try:
        summarizer_client = Client(
            "raannakasturi/ReXploreAPI",
            hf_token=os.environ.get("HF_API_TOKEN"),
        )
        result = summarizer_client.predict(
            url=pdf_url,
            id=paper_id,
            access_key=access_key,
            api_name="/rexplore_summarizer"
        )
        if result:
            data = json.loads(result[0])
            if data["mindmap_status"] == "success":
                mindmap = data["mindmap"]
            if data["summary_status"] == "success":
                summary = data["summary"]
    except Exception as e:
        # Best-effort: a failed request or malformed response must not stop
        # the caller's loop, so report it and fall through to the return.
        print(f"Error summarizing paper: {e}")
    return summary, mindmap
|