From 94de39dde2e616a01683c518023d0fab72b4e103 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 19 二月 2024 22:21:50 +0800
Subject: [PATCH] aishell example
---
funasr/models/ct_transformer/utils.py | 81 ++++++++++++++++++++++++++++++++++++++++
1 files changed, 81 insertions(+), 0 deletions(-)
diff --git a/funasr/models/ct_transformer/utils.py b/funasr/models/ct_transformer/utils.py
index 0291dbc..01b1850 100644
--- a/funasr/models/ct_transformer/utils.py
+++ b/funasr/models/ct_transformer/utils.py
@@ -1,3 +1,9 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+import re
def split_to_mini_sentence(words: list, word_limit: int = 20):
@@ -12,3 +18,78 @@
if length % word_limit > 0:
sentences.append(words[sentence_len * word_limit:])
return sentences
+
# Matches either a maximal run of ASCII characters or one single non-ASCII
# character; used to tokenize a whitespace-free segment.
_ASCII_RUN_OR_SINGLE_CHAR = re.compile(r"[\x00-\x7f]+|[^\x00-\x7f]")


def split_words(text: str, jieba_usr_dict=None, **kwargs):
    """Split whitespace-separated, mixed-script text into a list of words.

    Two strategies are available, selected by ``jieba_usr_dict``:

    * With a truthy ``jieba_usr_dict``: the whitespace-separated tokens are
      grouped into consecutive runs of "English" tokens (as decided by
      ``isEnglish``) and runs of everything else. English tokens are kept
      as they are; each non-English run is joined with
      ``join_chinese_and_english`` and segmented with
      ``jieba_usr_dict.cut(..., HMM=False)``.
    * Otherwise: every whitespace-separated segment is split so that each
      maximal run of ASCII characters becomes one word and every non-ASCII
      character becomes a word of its own.

    Args:
        text: Input text; tokens are separated by whitespace.
        jieba_usr_dict: Optional segmenter object exposing a
            ``cut(text, HMM=False)`` method (presumably a jieba tokenizer
            loaded with a user dictionary -- confirm against the caller).
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        A list of word strings in their original order.
    """
    if jieba_usr_dict:
        return _split_words_with_segmenter(text, jieba_usr_dict)
    return _split_words_by_char(text)


def _split_words_with_segmenter(text, segmenter):
    """Keep English tokens whole and segment non-English runs with ``segmenter``."""
    # Function-scope import keeps this block self-contained.
    from itertools import groupby

    words = []
    # groupby yields consecutive tokens sharing the same isEnglish() verdict,
    # so isEnglish is evaluated exactly once per token.
    for is_english, run in groupby(text.split(), key=isEnglish):
        if is_english:
            words.extend(run)
        else:
            joined = join_chinese_and_english(list(run))
            words.extend(segmenter.cut(joined, HMM=False))
    return words


def _split_words_by_char(text):
    """Split into ASCII runs and individual non-ASCII characters."""
    words = []
    for seg in text.split():
        # seg contains no whitespace, so every character is covered by
        # exactly one alternative of the pattern.
        words.extend(_ASCII_RUN_OR_SINGLE_CHAR.findall(seg))
    return words
+
def isEnglish(text: str) -> bool:
    """Return True if ``text`` is made up only of ASCII letters and apostrophes.

    The whole string must match: an empty string, or a string containing any
    other character (digits, whitespace, a trailing newline, CJK characters,
    ...), yields False.

    Args:
        text: The token to classify.

    Returns:
        True when every character is in ``[a-zA-Z']`` and the string is
        non-empty, False otherwise.
    """
    # fullmatch instead of search('^...$'): `$` would also match just before
    # a trailing newline and wrongly accept e.g. "abc\n".
    return re.fullmatch(r"[a-zA-Z']+", text) is not None
+
def join_chinese_and_english(input_list):
    """Concatenate tokens, inserting a space before every English token.

    Tokens for which ``isEnglish`` returns True are prefixed with a single
    space; all other tokens are appended directly. Leading and trailing
    whitespace of the final string is stripped.

    Args:
        input_list: Iterable of string tokens.

    Returns:
        The joined string; empty when ``input_list`` is empty.
    """
    # Build all pieces first and join once instead of growing a string in a loop.
    pieces = [" " + token if isEnglish(token) else token for token in input_list]
    return "".join(pieces).strip()
--
Gitblit v1.9.1