From add315bdb35e09fe705d4eab39e4d2386734f4ae Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Fri, 17 Mar 2023 15:50:51 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add
---
funasr/bin/asr_inference_mfcca.py | 2 --
funasr/datasets/preprocessor.py | 10 +++-------
funasr/datasets/large_datasets/utils/tokenize.py | 10 +++-------
tests/test_asr_inference_pipeline.py | 2 +-
4 files changed, 7 insertions(+), 17 deletions(-)
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 888d4d2..6f3dbb1 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -41,8 +41,6 @@
from funasr.utils import asr_utils, wav_utils, postprocess_utils
import pdb
-header_colors = '\033[95m'
-end_colors = '\033[0m'
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index caeb426..a016e4e 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -18,15 +18,11 @@
def seg_tokenize(txt, seg_dict):
out_txt = ""
- pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
for word in txt:
- if pattern.match(word):
- if word in seg_dict:
- out_txt += seg_dict[word] + " "
- else:
- out_txt += "<unk>" + " "
+ if word in seg_dict:
+ out_txt += seg_dict[word] + " "
else:
- continue
+ out_txt += "<unk>" + " "
return out_txt.strip().split()
def tokenize(data,
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 20a3791..98cca1d 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -47,15 +47,11 @@
def seg_tokenize(txt, seg_dict):
out_txt = ""
- pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
for word in txt:
- if pattern.match(word):
- if word in seg_dict:
- out_txt += seg_dict[word] + " "
- else:
- out_txt += "<unk>" + " "
+ if word in seg_dict:
+ out_txt += seg_dict[word] + " "
else:
- continue
+ out_txt += "<unk>" + " "
return out_txt.strip().split()
def seg_tokenize_wo_pattern(txt, seg_dict):
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 32b8af5..b3c5a24 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -452,7 +452,7 @@
def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
+ model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
rec_result = inference_pipeline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
param_dict={"decoding_model": "offline"})
--
Gitblit v1.9.1