From 79bd015ab0ded4e5aed1b1ecf32fcbc84eefde68 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Fri, 03 Feb 2023 17:42:47 +0800
Subject: [PATCH] Merge pull request #58 from alibaba-damo-academy/dev
---
funasr/bin/asr_inference_paraformer.py | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 1a73457..0929436 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -95,10 +95,13 @@
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ if asr_model.ctc != None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
@@ -166,7 +169,7 @@
self.converter = converter
self.tokenizer = tokenizer
is_use_lm = lm_weight != 0.0 and lm_file is not None
- if ctc_weight == 0.0 and not is_use_lm:
+ if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
@@ -224,6 +227,8 @@
pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
predictor_outs[2], predictor_outs[3]
pre_token_length = pre_token_length.round().long()
+ if torch.max(pre_token_length) < 1:
+ return []
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -259,7 +264,7 @@
token_int = hyp.yseq[1:last_pos].tolist()
# remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0, token_int))
+ token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
@@ -391,7 +396,7 @@
# results = speech2text(**batch)
# if len(results) < 1:
# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-# results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+# results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
# time_end = time.time()
# forward_time = time_end - time_beg
# lfr_factor = results[0][-1]
@@ -524,6 +529,7 @@
nbest: int = 1,
num_workers: int = 1,
output_dir: Optional[str] = None,
+ param_dict: dict = None,
**kwargs,
):
assert check_argument_types()
@@ -573,6 +579,7 @@
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
output_dir_v2: Optional[str] = None,
+ param_dict: dict = None,
):
# 3. Build data-iterator
if data_path_and_name_and_type is None and raw_inputs is not None:
@@ -618,7 +625,7 @@
results = speech2text(**batch)
if len(results) < 1:
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+ results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
time_end = time.time()
forward_time = time_end - time_beg
lfr_factor = results[0][-1]
@@ -650,7 +657,7 @@
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
- ibest_writer["text"][key] = text
+ ibest_writer["text"][key] = text_postprocessed
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
--
Gitblit v1.9.1