From e2b3edec45fe1bfd76493acc366df971e89e7ae2 Mon Sep 17 00:00:00 2001
From: Xian Shi <40013335+R1ckShi@users.noreply.github.com>
Date: 星期四, 10 八月 2023 17:30:34 +0800
Subject: [PATCH] Merge pull request #830 from alibaba-damo-academy/dev_ts
---
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py | 68 ++++++++++++++++++++++++++++++++-
1 files changed, 65 insertions(+), 3 deletions(-)
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
index 9284943..f1fc9a0 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
@@ -6,11 +6,15 @@
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+import re
import numpy as np
import yaml
-from onnxruntime import (GraphOptimizationLevel, InferenceSession,
- SessionOptions, get_available_providers, get_device)
-
+try:
+    from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+                             SessionOptions, get_available_providers, get_device)
+except ImportError:  # a bare `except:` would also swallow KeyboardInterrupt/SystemExit
+    print("please pip3 install onnxruntime")
+import jieba
import warnings
root_dir = Path(__file__).resolve().parent
@@ -230,6 +234,64 @@
words.append(current_word)
return words
+def isEnglish(text: str) -> bool:
+    """Return True if `text` consists only of ASCII letters and apostrophes."""
+    # fullmatch anchors at the true end of the string, whereas a '^...$'
+    # search would also accept a trailing newline (e.g. "abc\n").
+    return re.fullmatch(r"[a-zA-Z']+", text) is not None
+
+def join_chinese_and_english(input_list):
+    """Concatenate tokens into one string.
+
+    Tokens that `isEnglish` accepts are prefixed with a single space; all
+    other tokens are appended directly.  Leading and trailing whitespace of
+    the combined string is stripped before it is returned.
+    """
+    # str.join builds the result in one pass instead of repeated `+`.
+    pieces = [' ' + token if isEnglish(token) else token for token in input_list]
+    return ''.join(pieces).strip()
+
+def code_mix_split_words_jieba(seg_dict_file: str):
+    """Build a word splitter for mixed Chinese/English text.
+
+    The user dictionary is loaded into jieba once, when this factory is
+    called, rather than on every split.
+
+    Args:
+        seg_dict_file: Path handed to `jieba.load_userdict`.
+
+    Returns:
+        A function `_fn(text)` returning a list of words.  It splits `text`
+        on whitespace and groups consecutive tokens by whether `isEnglish`
+        accepts them.  Tokens of an English run are kept unchanged; every
+        other run is merged with `join_chinese_and_english` and segmented
+        again with `jieba.cut(..., HMM=False)`.  Text without any tokens
+        yields an empty list.
+    """
+    # Imported locally so this function does not depend on a module-level
+    # itertools import.
+    from itertools import groupby
+
+    jieba.load_userdict(seg_dict_file)
+
+    def _fn(text: str):
+        result_list = []
+        # groupby yields maximal runs of consecutive tokens that share the
+        # same isEnglish() result, so each token is classified exactly once
+        # and the runs come out in input order (English and non-English
+        # stretches alternate).
+        for is_english, run in groupby(text.split(), key=isEnglish):
+            if is_english:
+                result_list.extend(run)
+            else:
+                # Pass a real list rather than groupby's one-shot iterator.
+                merged = join_chinese_and_english(list(run))
+                result_list.extend(jieba.cut(merged, HMM=False))
+
+        return result_list
+
+    return _fn
+
def read_yaml(yaml_path: Union[str, Path]) -> Dict:
if not Path(yaml_path).exists():
raise FileExistsError(f'The {yaml_path} does not exist.')
--
Gitblit v1.9.1