Changed files:
funasr/bin/punc_infer.py
funasr/bin/punc_train.py
funasr/datasets/large_datasets/build_dataloader.py
funasr/datasets/preprocessor.py
funasr/train/trainer.py
setup.py
funasr/bin/punc_infer.py
@@ -8,6 +8,7 @@
import numpy as np
import torch
import os

from funasr.build_utils.build_model_from_file import build_model_from_file
from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor

@@ -41,6 +42,11 @@
            self.punc_list[i] = "?"
        elif self.punc_list[i] == "。":
            self.period = i
        self.seg_dict_file = None
        self.seg_jieba = False
        if "seg_jieba" in train_args:
            self.seg_jieba = train_args.seg_jieba
            self.seg_dict_file = os.path.dirname(model_file) + "/" + "jieba_usr_dict"
        self.preprocessor = CodeMixTokenizerCommonPreprocessor(
            train=False,
            token_type=train_args.token_type,

@@ -50,6 +56,8 @@
            g2p_type=train_args.g2p,
            text_name="text",
            non_linguistic_symbols=train_args.non_linguistic_symbols,
            seg_jieba=self.seg_jieba,
            seg_dict_file=self.seg_dict_file
        )

    @torch.no_grad()
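At inference time this means jieba segmentation is only switched on when the exported training config defines a seg_jieba flag, and the jieba user dictionary is expected to sit next to the punctuation model as a file named jieba_usr_dict. A minimal sketch of that lookup, where resolve_jieba_userdict and the example paths are purely illustrative and not part of the patch:

import os

def resolve_jieba_userdict(model_file, train_args):
    # Hypothetical helper mirroring the logic above: enable jieba only when the
    # training config carries seg_jieba, and point seg_dict_file at the
    # "jieba_usr_dict" file stored alongside the model file.
    seg_jieba = getattr(train_args, "seg_jieba", False)
    seg_dict_file = os.path.join(os.path.dirname(model_file), "jieba_usr_dict") if seg_jieba else None
    return seg_jieba, seg_dict_file

# e.g. resolve_jieba_userdict("exp/punc/model.pt", train_args) would return
# (True, "exp/punc/jieba_usr_dict") when train_args.seg_jieba is True.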
funasr/bin/punc_train.py

@@ -44,4 +44,10 @@
    else:
        args.distributed = False

    if args.dataset_type == "small":
        if args.batch_size is not None:
            args.batch_size = args.batch_size * args.ngpu * args.num_worker_count
        if args.batch_bins is not None:
            args.batch_bins = args.batch_bins * args.ngpu * args.num_worker_count

    main(args=args)
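For the small-dataset pipeline, the configured batch_size and batch_bins are therefore multiplied by ngpu * num_worker_count before training starts. The arithmetic is just the following; scale_for_workers is an illustrative name, not a FunASR function, and the example values are assumed:

def scale_for_workers(value, ngpu, num_worker_count):
    # Restates the multiplication in the patch above; None settings stay None.
    return None if value is None else value * ngpu * num_worker_count

# Assumed example: batch_size=100, ngpu=4, num_worker_count=2 -> 100 * 4 * 2 = 800.
assert scale_for_workers(100, ngpu=4, num_worker_count=2) == 800
assert scale_for_workers(None, ngpu=4, num_worker_count=2) is None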
funasr/datasets/large_datasets/build_dataloader.py

@@ -69,11 +69,14 @@
        symbol_table = read_symbol_table(args.token_list)
        if hasattr(args, "seg_dict_file") and args.seg_dict_file is not None:
            seg_dict = load_seg_dict(args.seg_dict_file)
        if hasattr(args, "punc_dict_file") and args.punc_dict_file is not None:
            punc_dict = read_symbol_table(args.punc_dict_file)
        if hasattr(args, "punc_list") and args.punc_list is not None:
            punc_dict = read_symbol_table(args.punc_list)
        if hasattr(args, "bpemodel") and args.bpemodel is not None:
            bpe_tokenizer = SentencepiecesTokenizer(args.bpemodel)
        self.dataset_conf = args.dataset_conf
        if "frontend_conf" not in args:
            self.frontend_conf = None
        else:
            self.frontend_conf = args.frontend_conf
        self.speed_perturb = args.speed_perturb if hasattr(args, "speed_perturb") else None
        logging.info("dataloader config: {}".format(self.dataset_conf))
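These reads are now guarded so the dataloader also accepts punctuation-training configs that simply omit a field. For fields that are absent entirely, the hasattr-and-not-None guard behaves like a getattr fallback, as in the illustrative snippet below (assumed example values, not FunASR code):

from argparse import Namespace

# Illustrative only: reading optional fields from a partial config namespace.
args = Namespace(token_list="tokens.txt", punc_list="punc_list.txt")  # assumed values
punc_list = getattr(args, "punc_list", None)          # present -> "punc_list.txt"
seg_dict_file = getattr(args, "seg_dict_file", None)  # absent  -> None, no AttributeError
speed_perturb = getattr(args, "speed_perturb", None)  # absent  -> None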
funasr/datasets/preprocessor.py

@@ -11,6 +11,7 @@
import numpy as np
import scipy.signal
import soundfile
import jieba

from funasr.text.build_tokenizer import build_tokenizer
from funasr.text.cleaner import TextCleaner

@@ -628,6 +629,7 @@
        text_name: str = "text",
        split_text_name: str = "split_text",
        split_with_space: bool = False,
        seg_jieba: bool = False,
        seg_dict_file: str = None,
    ):
        super().__init__(

@@ -655,6 +657,9 @@
        )
        # The data field name for split text.
        self.split_text_name = split_text_name
        self.seg_jieba = seg_jieba
        if self.seg_jieba:
            jieba.load_userdict(seg_dict_file)

    @classmethod
    def split_words(cls, text: str):

@@ -677,11 +682,72 @@
                words.append(current_word)
        return words

    @classmethod
    def isEnglish(cls, text: str):
        if re.search('^[a-zA-Z\']+$', text):
            return True
        else:
            return False

    @classmethod
    def join_chinese_and_english(cls, input_list):
        line = ''
        for token in input_list:
            if cls.isEnglish(token):
                line = line + ' ' + token
            else:
                line = line + token
        line = line.strip()
        return line

    @classmethod
    def split_words_jieba(cls, text: str):
        input_list = text.split()
        token_list_all = []
        langauge_list = []
        token_list_tmp = []
        language_flag = None
        for token in input_list:
            if cls.isEnglish(token) and language_flag == 'Chinese':
                token_list_all.append(token_list_tmp)
                langauge_list.append('Chinese')
                token_list_tmp = []
            elif not cls.isEnglish(token) and language_flag == 'English':
                token_list_all.append(token_list_tmp)
                langauge_list.append('English')
                token_list_tmp = []
            token_list_tmp.append(token)
            if cls.isEnglish(token):
                language_flag = 'English'
            else:
                language_flag = 'Chinese'
        if token_list_tmp:
            token_list_all.append(token_list_tmp)
            langauge_list.append(language_flag)
        result_list = []
        for token_list_tmp, language_flag in zip(token_list_all, langauge_list):
            if language_flag == 'English':
                result_list.extend(token_list_tmp)
            else:
                seg_list = jieba.cut(cls.join_chinese_and_english(token_list_tmp), HMM=False)
                result_list.extend(seg_list)
        return result_list

    def __call__(
        self, uid: str, data: Dict[str, Union[list, str, np.ndarray]]
    ) -> Dict[str, Union[list, np.ndarray]]:
        # Split words.
        if isinstance(data[self.text_name], str):
            if self.seg_jieba:
                # jieba.load_userdict(seg_dict_file)
                split_text = self.split_words_jieba(data[self.text_name])
            else:
                split_text = self.split_words(data[self.text_name])
        else:
            split_text = data[self.text_name]

@@ -782,7 +848,6 @@
    ) -> Dict[str, np.ndarray]:
        for i in range(self.num_tokenizer):
            text_name = self.text_name[i]
            #import pdb; pdb.set_trace()
            if text_name in data and self.tokenizer[i] is not None:
                text = data[text_name]
                text = self.text_cleaner(text)
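The new split_words_jieba path groups consecutive Chinese tokens into runs, keeps English tokens as-is, and only re-segments the Chinese runs with jieba (HMM disabled, using whatever user dictionary was loaded in the constructor). Below is a simplified, self-contained sketch of that strategy; is_english and resegment are illustrative names, not FunASR APIs:

import re
import jieba

def is_english(token):
    # Same test the patch uses: the token consists only of ASCII letters/apostrophes.
    return re.search(r"^[a-zA-Z']+$", token) is not None

def resegment(text):
    # Sketch of the strategy: buffer consecutive Chinese tokens, pass each buffered
    # run through jieba.cut(..., HMM=False), and copy English tokens through untouched.
    out, chinese_run = [], []
    for token in text.split():
        if is_english(token):
            if chinese_run:
                out.extend(jieba.cut("".join(chinese_run), HMM=False))
                chinese_run = []
            out.append(token)
        else:
            chinese_run.append(token)
    if chinese_run:
        out.extend(jieba.cut("".join(chinese_run), HMM=False))
    return out

# e.g. resegment("我们 使用 open source 工具") gives roughly
# ['我们', '使用', 'open', 'source', '工具'] (exact splits depend on jieba's dictionary).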
funasr/train/trainer.py

@@ -369,7 +369,7 @@
                ],
                "scaler": scaler.state_dict() if scaler is not None else None,
                "ema_model": model.encoder.ema.model.state_dict()
-               if hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
+               if hasattr(model, "encoder") and hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
            },
            buffer,
        )
setup.py

@@ -23,6 +23,7 @@
        "nltk>=3.4.5",
        # ASR
        "sentencepiece",
        "jieba",
        # TTS
        "pypinyin>=0.44.0",
        "espnet_tts_frontend",