| funasr/runtime/python/onnxruntime/demo_punc_offline.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
funasr/runtime/python/onnxruntime/demo_punc_offline.py
@@ -1,6 +1,7 @@ from funasr_onnx import CT_Transformer model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" #model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" model_dir = "damo/punc_ct-transformer_cn-en-common-vocab471067-large" model = CT_Transformer(model_dir) text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益" funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
@@ -10,7 +10,7 @@ from .utils.utils import (ONNXRuntimeError, OrtInferSession, get_logger, read_yaml) from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words) from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words,code_mix_split_words_jieba) logging = get_logger() @@ -65,9 +65,18 @@ self.punc_list[i] = "?" elif self.punc_list[i] == "。": self.period = i if "seg_jieba" in config: self.seg_jieba = True self.jieba_usr_dict_path = os.path.join(model_dir, 'jieba_usr_dict') self.code_mix_split_words_jieba = code_mix_split_words_jieba(self.jieba_usr_dict_path) else: self.seg_jieba = False def __call__(self, text: Union[list, str], split_size=20): split_text = code_mix_split_words(text) if self.seg_jieba: split_text = self.code_mix_split_words_jieba(text) else: split_text = code_mix_split_words(text) split_text_id = self.converter.tokens2ids(split_text) mini_sentences = split_to_mini_sentence(split_text, split_size) mini_sentences_id = split_to_mini_sentence(split_text_id, split_size) funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
# --- unchanged diff context from funasr_onnx/utils/utils.py (kept verbatim, not part of the definitions below) ---
# @@ -6,11 +6,12 @@
# from pathlib import Path
# from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
# import re
# import numpy as np
# import yaml
# from onnxruntime import (GraphOptimizationLevel, InferenceSession, SessionOptions, get_available_providers, get_device)
# import jieba
# import warnings
# root_dir = Path(__file__).resolve().parent
# @@ -230,6 +231,64 @@
# words.append(current_word)
# return words

import re
from itertools import groupby

# A token counts as "English" when it is made only of ASCII letters and
# apostrophes (e.g. "don't"). Compiled once because it is applied per token.
_ENGLISH_TOKEN_RE = re.compile(r"[a-zA-Z']+")


def isEnglish(text: str) -> bool:
    """Return True if *text* consists solely of ASCII letters and apostrophes.

    The whole string must match: the empty string, tokens containing digits
    or non-ASCII characters, and strings with a trailing newline are all
    rejected.
    """
    # fullmatch (rather than search with '^...$') so that a trailing "\n"
    # is not silently accepted by the '$' anchor.
    return _ENGLISH_TOKEN_RE.fullmatch(text) is not None


def join_chinese_and_english(input_list):
    """Concatenate tokens, putting one space in front of each English token.

    Tokens for which ``isEnglish`` is false are appended with no separator.
    Leading and trailing whitespace of the final string is stripped.

    Args:
        input_list: Iterable of string tokens.

    Returns:
        The joined string.
    """
    pieces = (' ' + token if isEnglish(token) else token for token in input_list)
    return ''.join(pieces).strip()


def code_mix_split_words_jieba(seg_dict_file: str):
    """Build a splitter for mixed English / non-English text backed by jieba.

    The jieba user dictionary at *seg_dict_file* is loaded once, when this
    factory is called.

    Args:
        seg_dict_file: Path passed to ``jieba.load_userdict``.

    Returns:
        A function ``_fn(text) -> list`` that splits *text* on whitespace,
        groups consecutive tokens by ``isEnglish``, keeps English tokens
        as they are, and replaces every other run with the segments produced
        by ``jieba.cut(..., HMM=False)`` on the joined run.
    """
    # Imported here so that this module can be imported without jieba
    # installed; jieba is only required when this factory is actually used.
    import jieba

    jieba.load_userdict(seg_dict_file)

    def _fn(text: str):
        result_list = []
        # groupby yields maximal runs of consecutive tokens that share the
        # same isEnglish() value, in their original order.
        for is_english, run in groupby(text.split(), key=isEnglish):
            tokens = list(run)
            if is_english:
                result_list.extend(tokens)
            else:
                # Every token that is not English by the test above (this
                # includes digits and mixed tokens) is segmented by jieba.
                joined = join_chinese_and_english(tokens)
                result_list.extend(jieba.cut(joined, HMM=False))
        return result_list

    return _fn

# --- unchanged diff context: next definition, cut off in this view ---
# def read_yaml(yaml_path: Union[str, Path]) -> Dict:
#     if not Path(yaml_path).exists():
#         raise FileExistsError(f'The {yaml_path} does not exist.')