From 90bc3ad02eee3745188be3960036ae3e9e746049 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 15 Apr 2024 15:35:06 +0800
Subject: [PATCH] bugfix: sense_voice result text, char_tokenizer seg_tokenize pattern, paraformer finetune.sh
---
funasr/models/sense_voice/model.py | 4 ++--
funasr/tokenizer/char_tokenizer.py | 3 ++-
examples/industrial_data_pretraining/paraformer/finetune.sh | 2 +-
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh
index 25d9e1a..fe511ff 100644
--- a/examples/industrial_data_pretraining/paraformer/finetune.sh
+++ b/examples/industrial_data_pretraining/paraformer/finetune.sh
@@ -10,7 +10,7 @@
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-
+model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py
index f8c1177..242664b 100644
--- a/funasr/models/sense_voice/model.py
+++ b/funasr/models/sense_voice/model.py
@@ -96,9 +96,9 @@
options = whisper.DecodingOptions(**DecodingOptions)
result = whisper.decode(self.model, speech, options)
-
+ text = f"{result.text}\n"
results = []
- result_i = {"key": key[0], "text": result.text}
+ result_i = {"key": key[0], "text": text}
results.append(result_i)
diff --git a/funasr/tokenizer/char_tokenizer.py b/funasr/tokenizer/char_tokenizer.py
index 2efc0b0..92c6e67 100644
--- a/funasr/tokenizer/char_tokenizer.py
+++ b/funasr/tokenizer/char_tokenizer.py
@@ -93,7 +93,8 @@
return seg_dict
def seg_tokenize(txt, seg_dict):
- pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
+ # pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
+ pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
out_txt = ""
for word in txt:
word = word.lower()
--
Gitblit v1.9.1