import pynini
|
from fun_text_processing.text_normalization.en.graph_utils import (
|
DAMO_CHAR,
|
DAMO_DIGIT,
|
DAMO_SIGMA,
|
DAMO_SPACE,
|
GraphFst,
|
)
|
from fun_text_processing.text_normalization.es.utils import get_abs_path
|
from pynini.lib import pynutil
|
|
ordinal_exceptions = pynini.string_file(get_abs_path("data/fractions/ordinal_exceptions.tsv"))
|
higher_powers_of_ten = pynini.string_file(get_abs_path("data/fractions/powers_of_ten.tsv"))
|
|
|
class FractionFst(GraphFst):
|
"""
|
Finite state transducer for classifying fraction
|
"23 4/5" ->
|
tokens { fraction { integer: "veintitrés" numerator: "cuatro" denominator: "quinto" mophosyntactic_features: "ordinal" } }
|
|
Args:
|
cardinal: CardinalFst
|
ordinal: OrdinalFst
|
deterministic: if True will provide a single transduction option,
|
for False multiple transduction are generated (used for audio-based normalization)
|
"""
|
|
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True):
|
super().__init__(name="fraction", kind="classify", deterministic=deterministic)
|
cardinal_graph = cardinal.graph
|
ordinal_graph = ordinal.graph
|
|
# 2-10 are all ordinals
|
three_to_ten = pynini.string_map(
|
[
|
"2",
|
"3",
|
"4",
|
"5",
|
"6",
|
"7",
|
"8",
|
"9",
|
"10",
|
]
|
)
|
block_three_to_ten = pynutil.delete(three_to_ten) # To block cardinal productions
|
if not deterministic: # Multiples of tens are sometimes rendered as ordinals
|
three_to_ten |= pynini.string_map(
|
[
|
"20",
|
"30",
|
"40",
|
"50",
|
"60",
|
"70",
|
"80",
|
"90",
|
]
|
)
|
graph_three_to_ten = three_to_ten @ ordinal_graph
|
graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", DAMO_SIGMA)
|
|
# Higher powers of tens (and multiples) are converted to ordinals.
|
hundreds = pynini.string_map(
|
[
|
"100",
|
"200",
|
"300",
|
"400",
|
"500",
|
"600",
|
"700",
|
"800",
|
"900",
|
]
|
)
|
graph_hundreds = hundreds @ ordinal_graph
|
|
multiples_of_thousand = ordinal.multiples_of_thousand # So we can have X milésimos
|
|
graph_higher_powers_of_ten = (
|
pynini.closure(ordinal.one_to_one_thousand + DAMO_SPACE, 0, 1)
|
+ pynini.closure("mil ", 0, 1)
|
+ pynini.closure(ordinal.one_to_one_thousand + DAMO_SPACE, 0, 1)
|
) # x millones / x mil millones / x mil z millones
|
graph_higher_powers_of_ten += higher_powers_of_ten
|
graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten
|
graph_higher_powers_of_ten @= pynini.cdrewrite(
|
pynutil.delete("un "),
|
pynini.accep("[BOS]"),
|
pynini.project(higher_powers_of_ten, "output"),
|
DAMO_SIGMA,
|
) # we drop 'un' from these ordinals (millionths, not one-millionths)
|
|
graph_higher_powers_of_ten = (
|
multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten
|
)
|
block_higher_powers_of_ten = pynutil.delete(
|
pynini.project(graph_higher_powers_of_ten, "input")
|
) # For cardinal graph
|
|
graph_fractions_ordinals = graph_higher_powers_of_ten | graph_three_to_ten
|
graph_fractions_ordinals += pynutil.insert(
|
'" morphosyntactic_features: "ordinal"'
|
) # We note the root for processing later
|
|
# Blocking the digits and hundreds from Cardinal graph
|
graph_fractions_cardinals = pynini.cdrewrite(
|
block_three_to_ten | block_higher_powers_of_ten,
|
pynini.accep("[BOS]"),
|
pynini.accep("[EOS]"),
|
DAMO_SIGMA,
|
)
|
graph_fractions_cardinals @= DAMO_CHAR.plus @ pynini.cdrewrite(
|
pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), DAMO_SIGMA
|
) # Empty characters become '0' for DAMO_CHAR fst, so need to block
|
graph_fractions_cardinals @= cardinal_graph
|
graph_fractions_cardinals += pynutil.insert(
|
'" morphosyntactic_features: "add_root"'
|
) # blocking these entries to reduce erroneous possibilities in debugging
|
|
if deterministic:
|
graph_fractions_cardinals = (
|
pynini.closure(DAMO_DIGIT, 1, 2) @ graph_fractions_cardinals
|
) # Past hundreds the conventional scheme can be hard to read. For determinism we stop here
|
|
graph_denominator = pynini.union(
|
graph_fractions_ordinals,
|
graph_fractions_cardinals,
|
pynutil.add_weight(cardinal_graph + pynutil.insert('"'), 0.001),
|
) # Last form is simply recording the cardinal. Weighting so last resort
|
|
integer = (
|
pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + DAMO_SPACE
|
)
|
numerator = (
|
pynutil.insert('numerator: "')
|
+ cardinal_graph
|
+ (pynini.cross("/", '" ') | pynini.cross(" / ", '" '))
|
)
|
denominator = pynutil.insert('denominator: "') + graph_denominator
|
|
self.graph = pynini.closure(integer, 0, 1) + numerator + denominator
|
|
final_graph = self.add_tokens(self.graph)
|
self.fst = final_graph.optimize()
|