ssl-aasist / fairseq /examples /mms /data_prep /norm_config.py

Add files using upload-large-folder tool

9742bb8 verified 5 months ago

6.53 kB

	import os
	import re


	# ASCII punctuation marks. "." and "?" go through re.escape, so those two
	# values carry a leading backslash; the other marks are stored verbatim.
	period = re.escape(".")
	question_mark = re.escape("?")
	comma = ","
	colon = ":"
	semicolon = ";"
	exclamation_mark = "!"
	quotation_mark = '"'
	left_curly_bracket = "{"
	right_curly_bracket = "}"

	# Core ASCII punctuation set, concatenated in a fixed order.
	# `semicolon` and `quotation_mark` are not part of this string.
	basic_punc = "".join(
	    [
	        period,
	        question_mark,
	        comma,
	        colon,
	        exclamation_mark,
	        left_curly_bracket,
	        right_curly_bracket,
	    ]
	)

	# NOTE: unless stated otherwise, the raw strings below hold the text of a
	# regex escape (backslash, "u", four hex digits), not the character itself.
	# Values of the form "\uXXXX-\uYYYY" are ranges and only make sense inside
	# a regex character class.

	# Invisible formatting characters from the General Punctuation block
	# (0x2000-0x206F): zero-width characters first, then directional controls.
	zero_width_space = r"\u200B"
	zero_width_nonjoiner = r"\u200C"
	left_to_right_mark = r"\u200E"
	right_to_left_mark = r"\u200F"
	left_to_right_embedding = r"\u202A"
	pop_directional_formatting = r"\u202C"

	# Curly single quotes that are commonly typed in place of an apostrophe
	left_single_quotation_mark = r"\u2018"
	right_single_quotation_mark = r"\u2019"

	# ---- Language specific definitions ----

	# Spanish: inverted sentence-opening marks
	inverted_question_mark = r"\u00BF"
	inverted_exclamation_mark = r"\u00A1"

	# Hindi: danda. Unlike its neighbours this is a non-raw literal, so the
	# value is the single character U+0964 rather than an escape sequence.
	hindi_danda = u"\u0964"

	# Egyptian Arabic
	# (arabic_percent, r"\u066A", is intentionally left undefined here)
	arabic_comma = r"\u060C"
	arabic_semicolon = r"\u061B"
	arabic_question_mark = r"\u061F"
	# Code point ranges (inclusive) of Arabic combining marks
	arabic_diacritics = r"\u064B-\u0652"
	arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"


	# Chinese
	# Each value is the text of a regex \u escape (or a "\uXXXX-\uYYYY" range
	# that is only meaningful inside a regex character class).
	full_stop = r"\u3002"
	full_comma = r"\uFF0C"
	full_exclamation_mark = r"\uFF01"
	full_question_mark = r"\uFF1F"
	full_semicolon = r"\uFF1B"
	full_colon = r"\uFF1A"
	full_parentheses = r"\uFF08\uFF09"
	quotation_mark_horizontal = r"\u300C-\u300F"
	# Vertical corner brackets live in CJK Compatibility Forms (U+FE41-U+FE44).
	# The previous value, U+FF41-U+FF44, is the fullwidth Latin letters a-d,
	# which put letters into the punctuation set.
	quotation_mark_vertical = r"\uFE41-\uFE44"
	title_marks = r"\u3008-\u300B"
	wavy_low_line = r"\uFE4F"
	ellipsis = r"\u22EF"
	enumeration_comma = r"\u3001"
	hyphenation_point = r"\u2027"
	forward_slash = r"\uFF0F"
	wavy_dash = r"\uFF5E"
	box_drawings_light_horizontal = r"\u2500"
	fullwidth_low_line = r"\uFF3F"

	# All Chinese punctuation above, concatenated in declaration order.
	chinese_punc = "".join(
	    [
	        full_stop,
	        full_comma,
	        full_exclamation_mark,
	        full_question_mark,
	        full_semicolon,
	        full_colon,
	        full_parentheses,
	        quotation_mark_horizontal,
	        quotation_mark_vertical,
	        title_marks,
	        wavy_low_line,
	        ellipsis,
	        enumeration_comma,
	        hyphenation_point,
	        forward_slash,
	        wavy_dash,
	        box_drawings_light_horizontal,
	        fullwidth_low_line,
	    ]
	)

	# Armenian punctuation, U+055A-U+055F plus the full stop U+0589, stored as
	# regex \u escapes.
	armenian_apostrophe = r"\u055A"
	emphasis_mark = r"\u055B"
	# NOTE: this rebinds the module-level name `exclamation_mark`, which held
	# "!" earlier in the file; `basic_punc` was already built from the old value.
	exclamation_mark = r"\u055C"
	armenian_comma = r"\u055D"
	armenian_question_mark = r"\u055E"
	abbreviation_mark = r"\u055F"
	armenian_full_stop = r"\u0589"

	# The seven Armenian marks concatenated in code point order.
	armenian_punc = "".join(
	    [
	        armenian_apostrophe,
	        emphasis_mark,
	        exclamation_mark,
	        armenian_comma,
	        armenian_question_mark,
	        abbreviation_mark,
	        armenian_full_stop,
	    ]
	)

	# "<" and ">" in two spellings: the literal characters (`*_symbol`) and
	# the equivalent regex \u escapes (`*_sign`).
	lesser_than_symbol = r"<"
	lesser_than_sign = r"\u003c"

	greater_than_symbol = r">"
	greater_than_sign = r"\u003e"

	# The literal text "&nbsp" (note: no trailing semicolon in the value).
	nbsp_written_form = r"&nbsp"

	# Quotation marks, stored as regex \u escapes.
	# Curly double quotes
	left_double_quotes = r"\u201c"
	right_double_quotes = r"\u201d"
	# Angle quotes (guillemets), double then single
	left_double_angle = r"\u00ab"
	right_double_angle = r"\u00bb"
	left_single_angle = r"\u2039"
	right_single_angle = r"\u203a"
	# Low-9 and high-reversed-9 variants
	low_double_quotes = r"\u201e"
	low_single_quotes = r"\u201a"
	high_double_quotes = r"\u201f"
	high_single_quotes = r"\u201b"

	# Every quote variant above plus the two curly single quotes defined
	# earlier in the file, concatenated in a fixed order.
	all_punct_quotes = "".join(
	    [
	        left_double_quotes,
	        right_double_quotes,
	        left_double_angle,
	        right_double_angle,
	        left_single_angle,
	        right_single_angle,
	        low_double_quotes,
	        low_single_quotes,
	        high_double_quotes,
	        high_single_quotes,
	        right_single_quotation_mark,
	        left_single_quotation_mark,
	    ]
	)

	# Regex character class matching any one of the three single-quote
	# variants that stand in for an apostrophe.
	mapping_quotes = (
	    f"[{high_single_quotes}{right_single_quotation_mark}{left_single_quotation_mark}]"
	)


	# Digits
	# Each value is an inclusive "\uXXXX-\uYYYY" code point range (or a single
	# code point) written as regex escapes, for use inside a character class.

	# ASCII and fullwidth forms
	english_digits = r"\u0030-\u0039"
	fullwidth_digits = r"\uff10-\uff19"

	# Script-specific decimal digits
	bengali_digits = r"\u09e6-\u09ef"
	khmer_digits = r"\u17e0-\u17e9"
	devanagari_digits = r"\u0966-\u096f"
	oriya_digits = r"\u0b66-\u0b6f"
	extended_arabic_indic_digits = r"\u06f0-\u06f9"
	kayah_li_digits = r"\ua900-\ua909"
	malayam_digits = r"\u0d66-\u0d6f"
	myanmar_digits = r"\u1040-\u1049"

	# Other numeral-related code points
	roman_numeral = r"\u2170-\u2179"
	nominal_digit_shapes = r"\u206f"

	# Load punctuations from MMS-lab data.
	# The list sits next to this module. The directory is resolved from the
	# absolute path so that a bare relative __file__ (empty dirname) does not
	# turn the target into "/punctuations.lst".
	_punc_path = os.path.join(
	    os.path.dirname(os.path.abspath(__file__)), "punctuations.lst"
	)
	# The file holds non-ASCII punctuation, so decode it explicitly as UTF-8
	# instead of relying on the platform's default encoding.
	with open(_punc_path, "r", encoding="utf-8") as punc_f:
	    punc_list = punc_f.readlines()

	# The first tab-separated field of each line is the punctuation to be
	# removed; each one is regex-escaped and all are concatenated in file order.
	# NOTE(review): a line without any tab contributes its trailing newline to
	# the pattern - confirm punctuations.lst never contains such lines.
	punct_pattern = "".join(re.escape(punc.split("\t")[0]) for punc in punc_list)

	# All digit ranges defined above, concatenated in a fixed order; used as
	# the default "digit_set" of the normalization config.
	shared_digits = "".join(
	    (
	        english_digits,
	        bengali_digits,
	        khmer_digits,
	        devanagari_digits,
	        oriya_digits,
	        extended_arabic_indic_digits,
	        kayah_li_digits,
	        fullwidth_digits,
	        malayam_digits,
	        myanmar_digits,
	        roman_numeral,
	        nominal_digit_shapes,
	    )
	)

	# Default punctuation set: the hand-written sets and single marks above,
	# followed by the escaped entries loaded from punctuations.lst. Some marks
	# (e.g. full_stop, enumeration_comma) appear both on their own and again
	# inside chinese_punc; the concatenation order is kept as is.
	shared_punc_list = "".join(
	    (
	        basic_punc,
	        all_punct_quotes,
	        greater_than_sign,
	        lesser_than_sign,
	        inverted_question_mark,
	        full_stop,
	        semicolon,
	        armenian_punc,
	        inverted_exclamation_mark,
	        arabic_comma,
	        enumeration_comma,
	        hindi_danda,
	        quotation_mark,
	        arabic_semicolon,
	        arabic_question_mark,
	        chinese_punc,
	        punct_pattern,
	    )
	)

	# Default pattern -> replacement table.
	# "<", ">" and the literal "&nbsp" are mapped to the empty string.
	shared_mappping = {
	    symbol: ""
	    for symbol in (lesser_than_symbol, greater_than_symbol, nbsp_written_form)
	}
	# A single-quote variant sandwiched between two runs of non-whitespace is
	# rewritten to an ASCII apostrophe, keeping both runs via backreferences.
	shared_mappping[rf"(\S+){mapping_quotes}(\S+)"] = r"\1'\2"

	# Default deletion set: invisible formatting characters and Arabic
	# combining marks, concatenated in a fixed order.
	shared_deletion_list = "".join(
	    (
	        left_to_right_mark,
	        zero_width_nonjoiner,
	        arabic_subscript_alef_and_inverted_damma,
	        zero_width_space,
	        arabic_diacritics,
	        pop_directional_formatting,
	        right_to_left_mark,
	        left_to_right_embedding,
	    )
	)

	# Normalization settings keyed by language code. "*" holds the defaults
	# that the language-specific entries later in this file are copied from.
	# How each setting is interpreted is decided by the code consuming this
	# dict, which is not part of this module.
	norm_config = {
	    "*": dict(
	        lower_case=True,
	        punc_set=shared_punc_list,
	        del_set=shared_deletion_list,
	        mapping=shared_mappping,
	        digit_set=shared_digits,
	        unicode_norm="NFKC",
	        rm_diacritics=False,
	    )
	}

	# Language-specific overrides.
	# Each entry starts as a shallow copy of the "*" defaults. Extending a
	# string setting with += rebinds the key in the copy only, so the defaults
	# are untouched. The "mapping" value is a dict shared by every copy, so it
	# must be replaced with a new dict rather than mutated in place.

	# =============== Mongolian =============== #

	norm_config["mon"] = norm_config["*"].copy()
	# add soft hyphen (U+00AD) to the deletion set to match with fleurs
	norm_config["mon"]["del_set"] += r"\u00AD"

	norm_config["khk"] = norm_config["mon"].copy()

	# =============== Hebrew =============== #

	norm_config["heb"] = norm_config["*"].copy()
	# add "HEBREW POINT" symbols to match with fleurs
	norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

	# =============== Thai =============== #

	norm_config["tha"] = norm_config["*"].copy()
	# add "Zero width joiner" symbols to match with fleurs
	norm_config["tha"]["punc_set"] += r"\u200D"

	# =============== Arabic =============== #
	norm_config["ara"] = norm_config["*"].copy()
	# Map alef wasla to plain alef for Arabic only. Building a new dict keeps
	# this rule out of the shared default mapping, which the previous in-place
	# assignment leaked into "*" and every other language.
	norm_config["ara"]["mapping"] = {**norm_config["ara"]["mapping"], "ٱ": "ا"}
	norm_config["arb"] = norm_config["ara"].copy()

	# =============== Javanese =============== #
	norm_config["jav"] = norm_config["*"].copy()
	norm_config["jav"]["rm_diacritics"] = True