raannakasturi committed on
Commit
6440752
·
1 Parent(s): 10266ab

Fix encoding issues in paper_data function and add tests for text decoding

Browse files
Files changed (2) hide show
  1. main.py +9 -3
  2. test.py +7 -0
main.py CHANGED
@@ -31,14 +31,20 @@ def paper_data(paper_data, wait_time=5):
31
  continue
32
  try:
33
  try:
34
- escaped_title = repr(title.encode('latin1').decode('unicode-escape')).strip()
 
 
 
35
  except:
36
  escaped_title = repr(title).strip()
37
  title = html.escape(str(escaped_title).strip()[1:-1])
38
  try:
39
- encoded_bytes = citation.encode('latin1').decode('utf-8', errors='replace')
 
 
 
40
  except:
41
- encoded_bytes = repr(citation)
42
  citation = html.unescape(encoded_bytes)
43
  status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
44
  except Exception as e:
 
31
  continue
32
  try:
33
  try:
34
+ try:
35
+ escaped_title = repr(title.encode('latin1').decode('unicode-escape', errors='replace')).strip()
36
+ except:
37
+ escaped_title = repr(title).strip().encode('latin1', errors='replace').decode('utf-8', errors='replace')
38
  except:
39
  escaped_title = repr(title).strip()
40
  title = html.escape(str(escaped_title).strip()[1:-1])
41
  try:
42
+ try:
43
+ encoded_bytes = citation.encode('latin1').decode('unicode-escape', errors='replace')
44
+ except:
45
+ encoded_bytes = repr(citation).strip().encode('latin1').decode('utf-8', errors='replace')
46
  except:
47
+ encoded_bytes = repr(citation).strip()
48
  citation = html.unescape(encoded_bytes)
49
  status = post_blog(doi, title, category, summary, mindmap, citation, access_key, wait_time)
50
  except Exception as e:
test.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Original text with incorrect encoding
2
+ text = "Itâ\x80\x99s the AIâ\x80\x99s fault, not mine: Mind perception increases blame attribution to AI"
3
+
4
+ # Encode back to the original bytes as 'latin1', then decode those bytes as 'utf-8'
5
+ fixed_text = text.encode('latin1').decode('utf-8')
6
+
7
+ print(fixed_text)