import pynini
|
from fun_text_processing.text_normalization.en.graph_utils import (
|
DAMO_ALPHA,
|
DAMO_DIGIT,
|
DAMO_SIGMA,
|
DAMO_SPACE,
|
DAMO_WHITE_SPACE,
|
GraphFst,
|
delete_space,
|
insert_space,
|
)
|
from fun_text_processing.text_normalization.es.graph_utils import cardinal_separator
|
from fun_text_processing.text_normalization.es.utils import get_abs_path
|
from pynini.lib import pynutil
|
|
def _load_inverted_table(relative_path):
    """Compile the TSV at `relative_path` into a string map and swap its two sides.

    The path is resolved with `get_abs_path`. The inverted tables are later
    composed with digit strings on their input side, so the files presumably
    list the written form first and the digits second -- verify against the data.
    """
    table = pynini.string_file(get_abs_path(relative_path))
    return pynini.invert(table)


# Lookup tables for the building blocks of a cardinal, one per data file.
zero = _load_inverted_table("data/numbers/zero.tsv")
digit = _load_inverted_table("data/numbers/digit.tsv")
teen = _load_inverted_table("data/numbers/teen.tsv")
ties = _load_inverted_table("data/numbers/ties.tsv")
twenties = _load_inverted_table("data/numbers/twenties.tsv")
hundreds = _load_inverted_table("data/numbers/hundreds.tsv")
|
|
|
def filter_punctuation(fst: "pynini.FstLike") -> "pynini.FstLike":
    """
    Strip group separators from a number string before handing it to `fst`.

    Two input shapes are accepted: a bare run of one or more digits, or groups of
    three digits delimited by 'cardinal_separator' (see graph_utils) in which only
    the leading group may be shorter than three digits. Separators are deleted, so
    `fst` only ever receives plain digits:
        "1 000" -> "1000"
        "1.000.000" -> "1000000"

    Args:
        fst: Any pynini.FstLike object. The digit-string parser is composed onto it.

    Returns:
        fst: A pynini.FstLike object
    """
    drop_separator = pynutil.delete(cardinal_separator)
    leading_group = pynini.closure(DAMO_DIGIT, 1, 3)  # one to three digits open the number
    full_group = DAMO_DIGIT**3  # every later group is exactly three digits

    # String w/o punctuation (used for page numbers, thousand series)
    plain_digits = pynini.closure(DAMO_DIGIT, 1)

    # Punctuated string: leading group, any number of inner groups, one final group
    grouped_digits = (
        leading_group
        + drop_separator
        + pynini.closure(full_group + drop_separator)
        + full_group
    )

    return (plain_digits | grouped_digits) @ fst
|
|
|
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals, e.g.
        "1000" -> cardinal { integer: "mil" }
        "2.000.000" -> cardinal { integer: "dos millones" }

    Internally the digit string is left-padded with zeros to 24 digits and read
    as four six-digit groups: trillones, billones, millones and the remainder.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        # ---- One digit ----
        unit = digit
        unit_except_one = (DAMO_DIGIT - "1") @ unit
        # A "0" that is followed by a digit is rewritten to a space
        zero_then_unit = pynini.cross("0", DAMO_SPACE) + unit

        # ---- Two digits: teens, ties (bare or joined to a unit with " y "), twenties ----
        ties_and_unit = ties + (pynutil.delete("0") | (pynutil.insert(" y ") + unit))
        two_digits = pynini.union(teen, ties_and_unit, twenties)
        self.tens = two_digits.optimize()

        self.two_digit_non_zero = pynini.union(unit, two_digits, zero_then_unit).optimize()

        # ---- Three digits ----
        after_hundreds = pynini.union(
            pynutil.delete("00"),
            insert_space + two_digits,
            zero_then_unit,
        )
        after_ciento = pynini.union(two_digits, pynutil.delete("0") + unit)
        three_digits = pynini.union(
            hundreds + after_hundreds,
            pynini.cross("100", "cien"),
            pynini.cross("1", "ciento") + insert_space + after_ciento,
        )
        self.hundreds = three_digits.optimize()

        # ---- Three-digit slot that tolerates leading zeroes ----
        # (the input is zero-padded further down so every slot has a fixed width)
        slot = pynini.union(three_digits, pynutil.delete("0") + two_digits)
        slot_non_zero = slot | (pynutil.delete("00") + unit)
        slot_non_zero_no_one = slot | (pynutil.delete("00") + unit_except_one)

        # ---- Six-digit group: thousands slot followed by units slot ----
        # Lower three digits after "mil": either spelled out or all zeroes
        lower_slot = (insert_space + slot_non_zero) | pynutil.delete("000")
        counted_thousands = slot_non_zero_no_one + pynutil.insert(" mil") + lower_slot
        bare_thousand = pynini.cross("001", "mil") + lower_slot  # "mil", never "uno mil"

        group_non_zero = pynini.union(
            pynutil.delete("000") + slot_non_zero,
            counted_thousands,
            bare_thousand,
        )
        # Same, except that the group as a whole may not be the lone value "000001"
        group_non_zero_no_one = pynini.union(
            pynutil.delete("000") + slot_non_zero_no_one,
            counted_thousands,
            bare_thousand,
        )

        def scaled_group(singular, plural):
            """Six-digit group carrying a scale word; an all-zero group yields only a space."""
            scaled = pynutil.add_weight(pynini.cross("000001", singular), -0.001)
            scaled |= group_non_zero_no_one + pynutil.insert(plural)
            scaled |= pynutil.delete("000000")
            return scaled + insert_space

        millions = scaled_group("un millón", " millones")
        billions = scaled_group("un billón", " billones")
        trillions = scaled_group("un trillón", " trillones")

        twenty_four_digits = (
            trillions + billions + millions + (group_non_zero | pynutil.delete("000000"))
        )

        # ---- Wrap the fixed-width reader so it accepts ordinary digit strings ----
        no_leading_zero = (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT, 0)
        pad_with_zeroes = pynini.cdrewrite(
            pynini.closure(pynutil.insert("0")), "[BOS]", "", DAMO_SIGMA
        )
        # Clean-up of the spaces left behind by empty groups
        trim_start = pynini.cdrewrite(delete_space, "[BOS]", "", DAMO_SIGMA)
        trim_end = pynini.cdrewrite(delete_space, "", "[EOS]", DAMO_SIGMA)
        squeeze_inner = pynini.cdrewrite(
            pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 2), DAMO_SPACE),
            DAMO_ALPHA,
            DAMO_ALPHA,
            DAMO_SIGMA,
        )

        number_reader = (
            no_leading_zero
            @ pad_with_zeroes
            @ DAMO_DIGIT**24  # keep only the padding that reaches exactly 24 digits
            @ twenty_four_digits
            @ trim_start
            @ trim_end
            @ squeeze_inner
        )

        # "0" is excluded above, so it comes from its own table
        self.graph = filter_punctuation(number_reader | zero).optimize()

        # ---- Token serialization ----
        minus_sign = pynutil.insert("negative: ") + pynini.cross("-", '"true" ')
        optional_minus = pynini.closure(minus_sign, 0, 1)
        tagged = optional_minus + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')

        self.fst = self.add_tokens(tagged).optimize()
|