From d74fdb3a7404cdbf28fd70849db6f9d4b87a7757 Mon Sep 17 00:00:00 2001
From: Zhihao Du <neo.dzh@alibaba-inc.com>
Date: Thu, 15 Jun 2023 22:11:06 +0800
Subject: [PATCH] Update test_sv_inference_pipeline.py (#635)
---
funasr/bin/asr_infer.py | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index 9da7ef7..47ce0ee 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -305,6 +305,7 @@
nbest: int = 1,
frontend_conf: dict = None,
hotword_list_or_file: str = None,
+ decoding_ind: int = 0,
**kwargs,
):
assert check_argument_types()
@@ -415,6 +416,7 @@
self.nbest = nbest
self.frontend = frontend
self.encoder_downsampling_factor = 1
+ self.decoding_ind = decoding_ind
if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
self.encoder_downsampling_factor = 4
@@ -452,7 +454,7 @@
batch = to_device(batch, device=self.device)
# b. Forward Encoder
- enc, enc_len = self.asr_model.encode(**batch)
+ enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@@ -491,9 +493,9 @@
else:
if pre_token_length[i] == 0:
yseq = torch.tensor(
- [self.asr_model.sos] + [self.asr_model.eos], device=yseq.device
+ [self.asr_model.sos] + [self.asr_model.eos], device=pre_acoustic_embeds.device
)
- score = torch.tensor(0.0, device=yseq.device)
+ score = torch.tensor(0.0, device=pre_acoustic_embeds.device)
else:
yseq = am_scores.argmax(dim=-1)
score = am_scores.max(dim=-1)[0]
@@ -1510,8 +1512,13 @@
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+ if self.frontend is not None:
+ speech = torch.unsqueeze(speech, axis=0)
+ speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ feats, feats_lengths = self.frontend(speech, speech_lengths)
+ else:
+ feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
if self.asr_model.normalize is not None:
feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
@@ -1536,14 +1543,19 @@
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
-
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+
+ if self.frontend is not None:
+ speech = torch.unsqueeze(speech, axis=0)
+ speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ feats, feats_lengths = self.frontend(speech, speech_lengths)
+ else:
+ feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
feats = to_device(feats, device=self.device)
feats_lengths = to_device(feats_lengths, device=self.device)
- enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
+ enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths)
nbest_hyps = self.beam_search(enc_out[0])
--
Gitblit v1.9.1