| | |
| | | from pathlib import Path |
| | | from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union |
| | | |
| | | import re |
| | | import numpy as np |
| | | import yaml |
| | | from onnxruntime import (GraphOptimizationLevel, InferenceSession, |
| | | SessionOptions, get_available_providers, get_device) |
| | | from typeguard import check_argument_types |
| | | |
| | | try: |
| | | from onnxruntime import (GraphOptimizationLevel, InferenceSession, |
| | | SessionOptions, get_available_providers, get_device) |
| | | except: |
| | | print("please pip3 install onnxruntime") |
| | | import jieba |
| | | import warnings |
| | | |
| | | root_dir = Path(__file__).resolve().parent |
| | |
| | | class TokenIDConverter(): |
| | | def __init__(self, token_list: Union[List, str], |
| | | ): |
| | | check_argument_types() |
| | | |
| | | self.token_list = token_list |
| | | self.unk_symbol = token_list[-1] |
| | |
| | | space_symbol: str = "<space>", |
| | | remove_non_linguistic_symbols: bool = False, |
| | | ): |
| | | check_argument_types() |
| | | |
| | | self.space_symbol = space_symbol |
| | | self.non_linguistic_symbols = self.load_symbols(symbol_value) |
| | |
| | | words.append(current_word) |
| | | return words |
| | | |
def isEnglish(text: str):
    """Tell whether ``text`` is made up solely of ASCII letters and apostrophes.

    Args:
        text: The token to classify.

    Returns:
        ``True`` if the pattern matches, ``False`` otherwise. The empty
        string yields ``False`` because at least one character is required.
        Note that ``$`` also matches just before a single trailing newline.
    """
    return re.search('^[a-zA-Z\']+$', text) is not None
| | | |
def join_chinese_and_english(input_list):
    """Concatenate tokens into one string, space-separating the English ones.

    Every token for which ``isEnglish`` is true is preceded by a single
    space; all other tokens are appended directly. Leading and trailing
    whitespace of the assembled string is stripped before returning.

    Args:
        input_list: Iterable of string tokens.

    Returns:
        The joined, stripped string.
    """
    pieces = [
        ' ' + token if isEnglish(token) else token
        for token in input_list
    ]
    return ''.join(pieces).strip()
| | | |
def code_mix_split_words_jieba(seg_dict_file: str):
    """Build a word splitter for mixed Chinese/English text backed by jieba.

    The user dictionary at ``seg_dict_file`` is loaded into jieba once, when
    this factory is called.

    Args:
        seg_dict_file: Path of the dictionary passed to ``jieba.load_userdict``.

    Returns:
        A function ``_fn(text)`` that splits ``text`` on whitespace, groups
        consecutive tokens by language (as decided by ``isEnglish``), keeps
        English tokens unchanged and re-segments each non-English run with
        ``jieba.cut(..., HMM=False)``. It returns the resulting flat list.
    """
    jieba.load_userdict(seg_dict_file)

    def _fn(text: str):
        # Pass 1: collapse the whitespace-split tokens into maximal runs of
        # the same language, stored as (language, tokens) pairs.
        runs = []
        for token in text.split():
            language = 'English' if isEnglish(token) else 'Chinese'
            if runs and runs[-1][0] == language:
                runs[-1][1].append(token)
            else:
                runs.append((language, [token]))

        # Pass 2: English runs pass through verbatim; every other run is
        # joined back into a single string and segmented by jieba.
        words = []
        for language, tokens in runs:
            if language == 'English':
                words.extend(tokens)
            else:
                joined = join_chinese_and_english(tokens)
                words.extend(jieba.cut(joined, HMM=False))

        return words

    return _fn
| | | |
| | | def read_yaml(yaml_path: Union[str, Path]) -> Dict: |
| | | if not Path(yaml_path).exists(): |
| | | raise FileExistsError(f'The {yaml_path} does not exist.') |
| | |
| | | logger.addHandler(sh) |
| | | logger_initialized[name] = True |
| | | logger.propagate = False |
| | | logging.basicConfig(level=logging.ERROR) |
| | | return logger |