| | |
| | | |
| | | import numpy as np |
| | | import torch |
| | | import os |
| | | |
| | | from funasr.build_utils.build_model_from_file import build_model_from_file |
| | | from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor |
| | |
| | | self.punc_list[i] = "?" |
| | | elif self.punc_list[i] == "。": |
| | | self.period = i |
| | | self.seg_dict_file = None |
| | | self.seg_jieba = False |
| | | if "seg_jieba" in train_args: |
| | | self.seg_jieba = train_args.seg_jieba |
| | | self.seg_dict_file = os.path.dirname(model_file)+"/"+ "jieba_usr_dict" |
| | | self.preprocessor = CodeMixTokenizerCommonPreprocessor( |
| | | train=False, |
| | | token_type=train_args.token_type, |
| | |
| | | g2p_type=train_args.g2p, |
| | | text_name="text", |
| | | non_linguistic_symbols=train_args.non_linguistic_symbols, |
| | | seg_jieba=self.seg_jieba, |
| | | seg_dict_file=self.seg_dict_file |
| | | ) |
| | | |
| | | @torch.no_grad() |
| | |
| | | new_mini_sentence_punc += [int(x) for x in punctuations_np] |
| | | words_with_punc = [] |
| | | for i in range(len(mini_sentence)): |
| | | if (i==0 or self.punc_list[punctuations[i-1]] == "。" or self.punc_list[punctuations[i-1]] == "?") and len(mini_sentence[i][0].encode()) == 1: |
| | | mini_sentence[i] = mini_sentence[i].capitalize() |
| | | if i == 0: |
| | | if len(mini_sentence[i][0].encode()) == 1: |
| | | mini_sentence[i] = " " + mini_sentence[i] |
| | | if i > 0: |
| | | if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1: |
| | | mini_sentence[i] = " " + mini_sentence[i] |
| | | words_with_punc.append(mini_sentence[i]) |
| | | if self.punc_list[punctuations[i]] != "_": |
| | | words_with_punc.append(self.punc_list[punctuations[i]]) |
| | | punc_res = self.punc_list[punctuations[i]] |
| | | if len(mini_sentence[i][0].encode()) == 1: |
| | | if punc_res == ",": |
| | | punc_res = "," |
| | | elif punc_res == "。": |
| | | punc_res = "." |
| | | elif punc_res == "?": |
| | | punc_res = "?" |
| | | words_with_punc.append(punc_res) |
| | | new_mini_sentence += "".join(words_with_punc) |
| | | # Add Period for the end of the sentence |
| | | new_mini_sentence_out = new_mini_sentence |
| | |
| | | if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、": |
| | | new_mini_sentence_out = new_mini_sentence[:-1] + "。" |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] |
| | | elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?": |
| | | elif new_mini_sentence[-1] == ",": |
| | | new_mini_sentence_out = new_mini_sentence[:-1] + "." |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] |
| | | elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())==0: |
| | | new_mini_sentence_out = new_mini_sentence + "。" |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] |
| | | elif new_mini_sentence[-1] != "." and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())==1: |
| | | new_mini_sentence_out = new_mini_sentence + "." |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] |
| | | return new_mini_sentence_out, new_mini_sentence_punc_out |
| | | |
| | | |