|
import os |
|
import re |
|
|
|
|
|
# ASCII punctuation marks. "." and "?" are passed through re.escape, so
# their values carry a leading backslash ("\." and "\?").
period = re.escape(".")
question_mark = re.escape("?")
comma = ","
colon = ":"
semicolon = ";"
exclamation_mark = "!"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

# Concatenation of the core ASCII marks. `semicolon` and `quotation_mark`
# are deliberately not part of this string; they are defined on their own.
basic_punc = "".join(
    (
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    )
)
|
|
|
|
|
# Invisible formatting characters. These are raw strings, so each value is
# the literal six-character text "\uXXXX" rather than the character itself
# (presumably meant to be spliced into a regex character class).
zero_width_space, zero_width_nonjoiner = r"\u200B", r"\u200C"
left_to_right_mark, right_to_left_mark = r"\u200E", r"\u200F"
left_to_right_embedding, pop_directional_formatting = r"\u202A", r"\u202C"
|
|
|
|
|
# Curly single quotation marks (raw "\uXXXX" escape text).
left_single_quotation_mark = r"\u2018"
right_single_quotation_mark = r"\u2019"

# Inverted exclamation / question marks (raw "\uXXXX" escape text).
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# Devanagari danda. Unlike its neighbours this is NOT a raw string: the
# value is the single character U+0964 itself, not the escape text.
hindi_danda = "\u0964"
|
|
|
|
|
|
|
# Arabic punctuation (raw "\uXXXX" escape text).
arabic_comma, arabic_semicolon, arabic_question_mark = (
    r"\u060C",
    r"\u061B",
    r"\u061F",
)

# Arabic combining marks, expressed as "first-last" code point ranges.
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
|
|
|
|
|
|
|
# CJK / fullwidth punctuation. All values are raw strings holding "\uXXXX"
# escape text (single code points or "first-last" ranges).
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# Vertical presentation forms of the corner brackets live at U+FE41-U+FE44.
# The previous value (U+FF41-U+FF44) is the fullwidth Latin letters a-d,
# which made those letters count as punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# Every CJK mark above, concatenated in declaration order.
chinese_punc = "".join(
    (
        full_stop,
        full_comma,
        full_exclamation_mark,
        full_question_mark,
        full_semicolon,
        full_colon,
        full_parentheses,
        quotation_mark_horizontal,
        quotation_mark_vertical,
        title_marks,
        wavy_low_line,
        ellipsis,
        enumeration_comma,
        hyphenation_point,
        forward_slash,
        wavy_dash,
        box_drawings_light_horizontal,
        fullwidth_low_line,
    )
)
|
|
|
|
|
# Armenian punctuation (raw "\uXXXX" escape text).
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# Named with the `armenian_` prefix like its siblings. It used to be
# assigned to `exclamation_mark`, which silently overwrote the ASCII "!"
# constant of the same name for anything reading it after this point.
armenian_exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

# All Armenian marks above, concatenated in code point order.
armenian_punc = "".join(
    (
        armenian_apostrophe,
        emphasis_mark,
        armenian_exclamation_mark,
        armenian_comma,
        armenian_question_mark,
        abbreviation_mark,
        armenian_full_stop,
    )
)
|
|
|
# HTML entity spellings ("written forms") that can survive in scraped text.
# NOTE(review): these held the already-decoded characters ("<", ">" and a
# bare whitespace character). That made the *_symbol constants duplicates of
# the *_sign ones below, and made the whitespace entry - which is mapped to
# the empty string - delete whitespace. Restored to the entity text; confirm
# against the data this normaliser is run on.
lesser_than_symbol = r"&lt;"
greater_than_symbol = r"&gt;"

# The actual "<" and ">" characters, as "\uXXXX" escape text.
lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

# No trailing ";" here, so both "&nbsp" and "&nbsp;" contain this text.
nbsp_written_form = r"&nbsp"
|
|
|
|
|
# Typographic quotation marks (raw "\uXXXX" escape text), grouped in
# opening/closing or double/single pairs.
left_double_quotes, right_double_quotes = r"\u201c", r"\u201d"
left_double_angle, right_double_angle = r"\u00ab", r"\u00bb"
left_single_angle, right_single_angle = r"\u2039", r"\u203a"
low_double_quotes, low_single_quotes = r"\u201e", r"\u201a"
high_double_quotes, high_single_quotes = r"\u201f", r"\u201b"

# Every quote mark above plus the two curly single quotes defined earlier
# in this module.
all_punct_quotes = "".join(
    (
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    )
)

# A bracketed regex character class containing three single-quote variants.
mapping_quotes = (
    f"[{high_single_quotes}"
    f"{right_single_quotation_mark}"
    f"{left_single_quotation_mark}]"
)
|
|
|
|
|
|
|
|
|
# Decimal-digit code point ranges per script, as "first-last" escape text.
# Listed in ascending code point order.
english_digits = r"\u0030-\u0039"  # ASCII 0-9
extended_arabic_indic_digits = r"\u06f0-\u06f9"
devanagari_digits = r"\u0966-\u096f"
bengali_digits = r"\u09e6-\u09ef"
oriya_digits = r"\u0b66-\u0b6f"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
khmer_digits = r"\u17e0-\u17e9"
nominal_digit_shapes = r"\u206f"  # a single code point, not a range
roman_numeral = r"\u2170-\u2179"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
|
|
|
|
|
# Load extra punctuation from "punctuations.lst", which is expected to sit
# next to this module. abspath() guards against a relative/empty dirname
# (which would otherwise resolve to "/punctuations.lst").
# NOTE(review): the file is read as UTF-8 instead of the platform default -
# confirm that matches how the list is stored.
with open(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# Only the first tab-separated field of each line is used. The line
# terminator is stripped first so that a line without a tab no longer
# injects "\n" into the pattern, and blank lines are skipped.
punct_pattern = "".join(
    re.escape(field)
    for field in (line.rstrip("\r\n").split("\t")[0] for line in punc_list)
    if field
)
|
|
|
# Concatenation of every per-script digit range defined earlier in this module.
shared_digits = "".join(
    (
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    )
)
|
|
|
# Default punctuation string: the individual marks and per-script groups
# defined earlier in this module, followed by the escaped entries loaded
# from punctuations.lst. Some marks appear more than once (e.g. full_stop
# is also inside chinese_punc); the order and duplicates are kept as-is.
shared_punc_list = "".join(
    (
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    )
)
|
|
|
# Default replacement table (key -> replacement text).
# The first three keys are replaced by the empty string.
shared_mappping = dict.fromkeys(
    (lesser_than_symbol, greater_than_symbol, nbsp_written_form),
    "",
)
# The last key is a regex with two capture groups around one character from
# `mapping_quotes`; its replacement re-joins the groups with an ASCII "'".
shared_mappping[r"(\S+)" + mapping_quotes + r"(\S+)"] = r"\1'\2"
|
|
|
# Default deletion string: invisible formatting characters and Arabic
# combining-mark ranges defined earlier in this module, concatenated.
shared_deletion_list = "".join(
    (
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    )
)
|
|
|
# Normalisation settings keyed by language code; "*" holds the defaults
# that the per-language entries further down are copied from.
norm_config = {
    "*": dict(
        lower_case=True,
        punc_set=shared_punc_list,
        del_set=shared_deletion_list,
        mapping=shared_mappping,
        digit_set=shared_digits,
        unicode_norm="NFKC",
        rm_diacritics=False,
    )
}
|
|
|
|
|
|
|
# "mon": the defaults, with U+00AD (soft hyphen) appended to the deletion set.
norm_config["mon"] = {
    **norm_config["*"],
    "del_set": norm_config["*"]["del_set"] + r"\u00AD",
}

# "khk" starts as a shallow copy of the "mon" entry.
norm_config["khk"] = dict(norm_config["mon"])
|
|
|
|
|
|
|
# "heb": the defaults, with the U+05B0-U+05CF ranges appended to the
# deletion set.
norm_config["heb"] = {
    **norm_config["*"],
    "del_set": norm_config["*"]["del_set"] + r"\u05B0-\u05BF\u05C0-\u05CF",
}
|
|
|
|
|
|
|
# "tha": the defaults, with U+200D (zero width joiner) appended to the
# punctuation set.
norm_config["tha"] = {
    **norm_config["*"],
    "punc_set": norm_config["*"]["punc_set"] + r"\u200D",
}
|
|
|
|
|
# "ara": the defaults plus one extra character mapping.
norm_config["ara"] = norm_config["*"].copy()
# dict.copy() is shallow, so "mapping" would still be the very same dict
# object used by "*" and every other entry copied from it. Give "ara" its
# own mapping dict first, otherwise the rule added below leaks into all
# languages.
norm_config["ara"]["mapping"] = dict(norm_config["ara"]["mapping"])
norm_config["ara"]["mapping"]["ٱ"] = "ا"

# "arb" mirrors "ara", again with an independent mapping dict so later
# edits to one entry cannot affect the other.
norm_config["arb"] = norm_config["ara"].copy()
norm_config["arb"]["mapping"] = dict(norm_config["ara"]["mapping"])
|
|
|
|
|
# "jav": the defaults, but with diacritic removal switched on.
norm_config["jav"] = {**norm_config["*"], "rm_diacritics": True}
|
|
|
|