diff --git a/Jenkinsfile b/Jenkinsfile index d9c3a5984..ed82ba514 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-18-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency_singular.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency_singular.tsv new file mode 100644 index 000000000..af8d793f2 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/currency_singular.tsv @@ -0,0 +1,9 @@ +₹ रुपया +£ पाउंड +₩ वॉन +$ डॉलर +₺ लीरा +৳ टका +¥ येन +₦ नाइरा +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv index cf62891d1..a9186acc3 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -1,4 +1,5 @@ रुपए पैसे +रुपया पैसे पाउंड पेंस वॉन जिओन डॉलर सेंट @@ -6,4 +7,4 @@ टका पैसे येन सेन नाइरा कोबो -यूरो सेंट +यूरो सेंट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index 01e46352f..16f389ae7 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -16,18 +16,18 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space -from 
nemo_text_processing.text_normalization.hi.utils import get_abs_path +from nemo_text_processing.text_normalization.hi.utils import get_abs_path, load_labels currency_graph = pynini.string_file(get_abs_path("data/money/currency.tsv")) +currency_singular_graph = pynini.string_file(get_abs_path("data/money/currency_singular.tsv")) class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. - ₹५० -> money { money { currency_maj: "रुपए" integer_part: "पचास" } - ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } - ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } - Note that the 'centiles' string is a placeholder to handle by the verbalizer by applying the corresponding minor currency denomination + ₹५० -> money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "पैसे" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "पैसे" } Args: cardinal: CardinalFst @@ -41,30 +41,118 @@ def __init__(self, cardinal: GraphFst): cardinal_graph = cardinal.final_graph + _en_to_hi_digit = pynini.string_file(get_abs_path("data/ordinal/en_to_hi_digit.tsv")) + _deva_to_ascii = pynini.invert(_en_to_hi_digit) + deva_to_ascii = pynini.closure(_deva_to_ascii | pynini.union(*"0123456789"), 1) + + _ascii_digit = pynini.union(*"0123456789") + _ascii_nonzero = pynini.union(*"123456789") + _deva_nonzero = pynini.union(*"१२३४५६७८९") + _any_digit = _ascii_digit | pynini.union(*"०१२३४५६७८९") + _any_nonzero = _ascii_nonzero | _deva_nonzero + optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + pynutil.insert("negative: ") + pynini.cross("-", '"true"') + insert_space, 0, 1, ) + currency_major = pynutil.insert('currency_maj: "') + 
currency_graph + pynutil.insert('"') + currency_major_singular = pynutil.insert('currency_maj: "') + currency_singular_graph + pynutil.insert('"') + + one = pynini.union("1", "१") + integer_one = pynutil.insert('integer_part: "') + (one @ cardinal_graph) + pynutil.insert('"') integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') - fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') - currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') - graph_major_only = optional_graph_negative + currency_major + insert_space + integer - graph_major_and_minor = ( + strip_trailing_zeros = pynini.closure(_ascii_digit) + _ascii_nonzero + pynini.closure(pynutil.delete("0")) + canonicalise = ( + (pynutil.delete("0") + _ascii_nonzero) + | (_ascii_nonzero + pynutil.insert("0")) + | (_ascii_nonzero + _ascii_digit) + ) + two_digits_fractional_part = deva_to_ascii @ strip_trailing_zeros @ canonicalise + + fraction = ( + pynutil.insert('fractional_part: "') + (two_digits_fractional_part @ cardinal_graph) + pynutil.insert('"') + ) + + optional_delete_fractional_zeros = pynini.closure( + pynutil.delete(".") + pynini.closure(pynutil.delete("0") | pynutil.delete("०"), 1), + 0, + 1, + ) + + has_3plus_sig_digits = _any_digit + _any_digit + _any_nonzero + pynini.closure(_any_digit) + single_digit = _any_digit @ cardinal.single_digits_graph + decimal_digits = ( + pynutil.insert('fractional_part: "') + + single_digit + + pynini.closure(insert_space + single_digit) + + pynutil.insert('"') + ) + guarded_decimal_digits = has_3plus_sig_digits @ decimal_digits + + graph_decimal_path = ( optional_graph_negative + currency_major + insert_space - + integer + + pynutil.insert('integer_part: "') + + cardinal_graph + + pynutil.insert('"') + pynini.cross(".", " ") - + fraction + + guarded_decimal_digits + ).optimize() + + graph_major_only_singular = ( + optional_graph_negative + + 
currency_major_singular + insert_space - + currency_minor - ) + + integer_one + + optional_delete_fractional_zeros + ).optimize() + + graph_major_only = ( + optional_graph_negative + currency_major + insert_space + integer + optional_delete_fractional_zeros + ).optimize() + + maj_labels = load_labels(get_abs_path("data/money/currency.tsv")) + maj_singular_labels = load_labels(get_abs_path("data/money/currency_singular.tsv")) + maj_to_min = dict(load_labels(get_abs_path("data/money/major_minor_currencies.tsv"))) + + def _build_major_and_minor(sym_maj_labels, int_graph): + result = None + for sym, maj in sym_maj_labels: + min_name = maj_to_min.get(maj) + if not min_name: + continue - graph_currencies = graph_major_only | graph_major_and_minor + curr_maj = pynutil.insert('currency_maj: "') + pynini.cross(sym, maj) + pynutil.insert('"') + curr_min = pynutil.insert('currency_min: "') + pynutil.insert(min_name) + pynutil.insert('"') + + g = ( + optional_graph_negative + + curr_maj + + insert_space + + int_graph + + pynini.cross(".", " ") + + fraction + + insert_space + + curr_min + ).optimize() + + result = g if result is None else pynini.union(result, g).optimize() + + return result + + graph_major_and_minor = _build_major_and_minor(maj_labels, integer) + graph_major_and_minor_singular = _build_major_and_minor(maj_singular_labels, integer_one) + + graph_currencies = ( + pynutil.add_weight(graph_major_only_singular | graph_major_and_minor_singular, -0.001) + | pynutil.add_weight(graph_decimal_path, -0.0005) + | graph_major_only + | graph_major_and_minor + ) graph = graph_currencies.optimize() - final_graph = self.add_tokens(graph) - self.fst = final_graph + self.fst = self.add_tokens(graph) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index 048140295..1e5da99e4 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ 
b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,26 +15,16 @@ import pynini from pynini.lib import pynutil -major_minor_currencies = { - "रुपए": "पैसे", - "पाउंड": "पेंस", - "वॉन": "जिओन", - "डॉलर": "सेंट", - "लीरा": "कुरस", - "टका": "पैसे", - "येन": "सेन", - "नाइरा": "कोबो", - "यूरो": "सेंट", -} from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.hi.utils import get_abs_path, load_labels class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. - money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे - money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे + money { currency_maj: "रुपए" integer_part: "बारह" } -> बारह रुपए + money { currency_maj: "रुपए" integer_part: "बारह" fractional_part: "पचास" currency_min: "पैसे" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "पैसे" } -> पचास पैसे Args: cardinal: CardinalFst @@ -46,55 +36,67 @@ class MoneyFst(GraphFst): def __init__(self): super().__init__(name="money", kind="verbalize") - currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + sp = pynini.accep(NEMO_SPACE) + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - fractional_part = ( pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - # Handles major denominations only - graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major + currency_minor = pynutil.delete('currency_min: 
"') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + + graph_major_only = integer_part + sp + currency_major + + all_major_names = [maj for maj, _ in load_labels(get_abs_path("data/money/major_minor_currencies.tsv"))] - # Handles both major and minor denominations major_minor_graphs = [] + minor_only_graphs = [] + + for major in all_major_names: + graph_major_slot = pynutil.delete('currency_maj: "') + pynutil.delete(major) + pynutil.delete('"') + + major_minor_graphs.append( + graph_major_slot + + sp + + integer_part + + pynutil.insert(NEMO_SPACE) + + pynutil.insert(major) + + sp + + fractional_part + + sp + + currency_minor + ) - # Handles minor denominations only - minor_graphs = [] - - # Logic for handling minor denominations - for major, minor in major_minor_currencies.items(): - graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') - graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') - graph_major_minor_partial = ( - integer_part - + pynini.accep(NEMO_SPACE) - + graph_major - + pynini.accep(NEMO_SPACE) + minor_only_graphs.append( + graph_major_slot + + sp + + pynutil.delete('integer_part: "शून्य"') + + sp + fractional_part - + pynini.accep(NEMO_SPACE) - + graph_minor + + sp + + currency_minor ) - major_minor_graphs.append(graph_major_minor_partial) - graph_minor_partial = ( - pynutil.delete('integer_part: "शून्य"') - + pynutil.delete(NEMO_SPACE) - + pynutil.delete('currency_maj: "') + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_only_graphs) + + decimal_graphs = [] + for major in all_major_names: + decimal_graphs.append( + pynutil.delete('currency_maj: "') + pynutil.delete(major) + pynutil.delete('"') - + pynutil.delete(NEMO_SPACE) + + sp + + integer_part + + sp + + pynutil.insert(" दशमलव ") + fractional_part - + pynini.accep(NEMO_SPACE) - + graph_minor + + pynutil.insert(NEMO_SPACE) + + pynutil.insert(major) ) 
- minor_graphs.append(graph_minor_partial) - - graph_major_minor = pynini.union(*major_minor_graphs) - graph_minor_only = pynini.union(*minor_graphs) + graph_decimal_money = pynini.union(*decimal_graphs) - graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) | graph_decimal_money - delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() + self.fst = self.delete_tokens(graph).optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index 0b199ff37..43985374a 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -116,3 +116,11 @@ $९.९९~नौ डॉलर निन्यानबे सेंट ₦१०.२७~दस नाइरा सत्ताईस कोबो €200.90~दो सौ यूरो नब्बे सेंट €१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट +$1.12~एक डॉलर बारह सेंट +$1.123~एक दशमलव एक दो तीन डॉलर +$1.1234~एक दशमलव एक दो तीन चार डॉलर +₹2.2000~दो रुपए बीस पैसे +$1.2000~एक डॉलर बीस सेंट +₹1.500~एक रुपया पचास पैसे +₹5.00~पाँच रुपए +₹१~एक रुपया \ No newline at end of file