Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add

        results = speech2text(**batch)
        if len(results) < 1:
            hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
            results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
            results = [[" ", ["sil"], [2], hyp, 10, 6, []]] * nbest
        time_end = time.time()
        forward_time = time_end - time_beg
        lfr_factor = results[0][-1]

import numpy as np
import torch
import os

from funasr.build_utils.build_model_from_file import build_model_from_file
from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor

                self.punc_list[i] = "?"
            elif self.punc_list[i] == "。":
                self.period = i
        self.seg_dict_file = None
        self.seg_jieba = False
        if "seg_jieba" in train_args:
            self.seg_jieba = train_args.seg_jieba
            self.seg_dict_file = os.path.dirname(model_file) + "/" + "jieba_usr_dict"
        self.preprocessor = CodeMixTokenizerCommonPreprocessor(
            train=False,
            token_type=train_args.token_type,
            # ...
            g2p_type=train_args.g2p,
            text_name="text",
            non_linguistic_symbols=train_args.non_linguistic_symbols,
            seg_jieba=self.seg_jieba,
            seg_dict_file=self.seg_dict_file
        )

    @torch.no_grad()

    else:
        args.distributed = False

    if args.dataset_type == "small":
        if args.batch_size is not None:
            args.batch_size = args.batch_size * args.ngpu * args.num_worker_count
        if args.batch_bins is not None:
            args.batch_bins = args.batch_bins * args.ngpu * args.num_worker_count

    main(args=args)

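A quick worked example of the scaling above (values are illustrative, not from the diff): for the "small" dataset type the configured batch_size and batch_bins are multiplied by the GPU count and the worker-group count before being handed to main(), presumably so that each GPU/worker later receives the originally configured share once the data is partitioned again.

batch_size, ngpu, num_worker_count = 16, 2, 2  # illustrative values only
global_batch_size = batch_size * ngpu * num_worker_count
print(global_batch_size)  # 16 * 2 * 2 = 64
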
        symbol_table = read_symbol_table(args.token_list)
        if hasattr(args, "seg_dict_file") and args.seg_dict_file is not None:
            seg_dict = load_seg_dict(args.seg_dict_file)
        if hasattr(args, "punc_dict_file") and args.punc_dict_file is not None:
            punc_dict = read_symbol_table(args.punc_dict_file)
        if hasattr(args, "punc_list") and args.punc_list is not None:
            punc_dict = read_symbol_table(args.punc_list)
        if hasattr(args, "bpemodel") and args.bpemodel is not None:
            bpe_tokenizer = SentencepiecesTokenizer(args.bpemodel)
        self.dataset_conf = args.dataset_conf
        if "frontend_conf" not in args:
            self.frontend_conf = None
        else:
            self.frontend_conf = args.frontend_conf
        self.speed_perturb = args.speed_perturb if hasattr(args, "speed_perturb") else None
        logging.info("dataloader config: {}".format(self.dataset_conf))

    length = len(text)
    if 'hw_tag' in data:
        pre_index = None
        if hw_config['pre_hwlist'] is not None and hw_config['pre_prob'] > 0:
            # enable preset hotword detection during sampling
            pre_index = None
            for hw in hw_config['pre_hwlist']:
                hw = " ".join(seg_tokenize(hw, seg_dict))
                _find = " ".join(text).find(hw)

import numpy as np
import scipy.signal
import soundfile
import jieba

from funasr.text.build_tokenizer import build_tokenizer
from funasr.text.cleaner import TextCleaner

        text_name: str = "text",
        split_text_name: str = "split_text",
        split_with_space: bool = False,
        seg_jieba: bool = False,
        seg_dict_file: str = None,
    ):
        super().__init__(
            # ...
        )
        # The data field name for split text.
        self.split_text_name = split_text_name
        self.seg_jieba = seg_jieba
        if self.seg_jieba:
            jieba.load_userdict(seg_dict_file)
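            # Note (illustrative, not part of this change): jieba.load_userdict expects a
            # plain-text file with one entry per line in the form "word [freq] [pos_tag]",
            # where the frequency and POS tag are optional, e.g.
            #     语音识别 5 n
            #     阿里巴巴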

    @classmethod
    def split_words(cls, text: str):
        # ...
            words.append(current_word)
        return words

    @classmethod
    def isEnglish(cls, text: str):
        if re.search('^[a-zA-Z\']+$', text):
            return True
        else:
            return False

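    # For example (not part of the diff): isEnglish("Hello") and isEnglish("don't") are True,
    # while isEnglish("hello,") and isEnglish("你好") are False, since only tokens made purely
    # of ASCII letters and apostrophes count as English.
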
    @classmethod
    def join_chinese_and_english(cls, input_list):
        line = ''
        for token in input_list:
            if cls.isEnglish(token):
                line = line + ' ' + token
            else:
                line = line + token

        line = line.strip()
        return line

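    # For example (not part of the diff): join_chinese_and_english(['今', '天', 'hello', 'world'])
    # returns '今天 hello world': Chinese characters are concatenated directly and a space is
    # inserted only in front of English tokens.
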
    @classmethod
    def split_words_jieba(cls, text: str):
        # Group consecutive tokens by language: English tokens are kept as-is,
        # Chinese spans are re-joined and re-segmented with jieba.
        input_list = text.split()
        token_list_all = []
        language_list = []
        token_list_tmp = []
        language_flag = None
        for token in input_list:
            # Close the current group whenever the language switches.
            if cls.isEnglish(token) and language_flag == 'Chinese':
                token_list_all.append(token_list_tmp)
                language_list.append('Chinese')
                token_list_tmp = []
            elif not cls.isEnglish(token) and language_flag == 'English':
                token_list_all.append(token_list_tmp)
                language_list.append('English')
                token_list_tmp = []

            token_list_tmp.append(token)

            if cls.isEnglish(token):
                language_flag = 'English'
            else:
                language_flag = 'Chinese'

        if token_list_tmp:
            token_list_all.append(token_list_tmp)
            language_list.append(language_flag)

        result_list = []
        for token_list_tmp, language_flag in zip(token_list_all, language_list):
            if language_flag == 'English':
                result_list.extend(token_list_tmp)
            else:
                # Chinese tokens are concatenated and re-cut by jieba without the HMM.
                seg_list = jieba.cut(cls.join_chinese_and_english(token_list_tmp), HMM=False)
                result_list.extend(seg_list)

        return result_list

    def __call__(
        self, uid: str, data: Dict[str, Union[list, str, np.ndarray]]
    ) -> Dict[str, Union[list, np.ndarray]]:
        # Split words.
        if isinstance(data[self.text_name], str):
            if self.seg_jieba:
                # jieba.load_userdict(seg_dict_file)
                split_text = self.split_words_jieba(data[self.text_name])
            else:
                split_text = self.split_words(data[self.text_name])
        else:
            split_text = data[self.text_name]

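A rough usage sketch of the jieba-based splitting above (an assumed example, not part of the diff): English tokens pass through unchanged while Chinese characters are regrouped and re-segmented, and the exact cut depends on whichever dictionary jieba has loaded.

from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor

# Hypothetical code-mixed transcript: character-level Chinese mixed with English words.
tokens = CodeMixTokenizerCommonPreprocessor.split_words_jieba("今 天 天 气 不 错 hello world")
print(tokens)  # typically ['今天', '天气', '不错', 'hello', 'world'] with the default dictionary
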
    ) -> Dict[str, np.ndarray]:
        for i in range(self.num_tokenizer):
            text_name = self.text_name[i]
            # import pdb; pdb.set_trace()
            if text_name in data and self.tokenizer[i] is not None:
                text = data[text_name]
                text = self.text_cleaner(text)

            ],
            "scaler": scaler.state_dict() if scaler is not None else None,
            "ema_model": model.encoder.ema.model.state_dict()
            if hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
            if hasattr(model, "encoder") and hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
        },
        buffer,
    )

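A loose sketch of reading those optional entries back on the consumer side (the enclosing torch.save call, the checkpoint path, and the model/scaler objects are assumptions, not shown in this hunk): fields stored as None when a feature is disabled need to be guarded on load.

import torch

state = torch.load(checkpoint_path, map_location="cpu")  # checkpoint_path is assumed
if state.get("scaler") is not None:
    scaler.load_state_dict(state["scaler"])  # restore AMP GradScaler state
if state.get("ema_model") is not None:
    model.encoder.ema.model.load_state_dict(state["ema_model"])  # restore EMA weights
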
        "nltk>=3.4.5",
        # ASR
        "sentencepiece",
        "jieba",
        # TTS
        "pypinyin>=0.44.0",
        "espnet_tts_frontend",