From 5358e1f10072d2c8ad7547fb82425e761b8e94f5 Mon Sep 17 00:00:00 2001
From: haoneng.lhn <haoneng.lhn@alibaba-inc.com>
Date: Wed, 31 May 2023 19:06:05 +0800
Subject: [PATCH] asr_infer: add decoding_ind argument, apply frontend when configured, unpack 3-value encoder output

---
 funasr/bin/asr_infer.py |   27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index fc311c8..0e203c4 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -9,6 +9,7 @@
 import time
 import copy
 import os
+import re
 import codecs
 import tempfile
 import requests
@@ -304,6 +305,7 @@
             nbest: int = 1,
             frontend_conf: dict = None,
             hotword_list_or_file: str = None,
+            decoding_ind: int = 0,
             **kwargs,
     ):
         assert check_argument_types()
@@ -414,6 +416,7 @@
         self.nbest = nbest
         self.frontend = frontend
         self.encoder_downsampling_factor = 1
+        self.decoding_ind = decoding_ind
         if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
             self.encoder_downsampling_factor = 4
 
@@ -451,7 +454,7 @@
         batch = to_device(batch, device=self.device)
 
         # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode(**batch)
+        enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
         if isinstance(enc, tuple):
             enc = enc[0]
         # assert len(enc) == 1, len(enc)
@@ -1509,8 +1512,13 @@
         if isinstance(speech, np.ndarray):
             speech = torch.tensor(speech)
         
-        feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
-        feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+        if self.frontend is not None:
+            speech = torch.unsqueeze(speech, axis=0)
+            speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+            feats, feats_lengths = self.frontend(speech, speech_lengths)
+        else:                
+            feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+            feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
         
         if self.asr_model.normalize is not None:
             feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
@@ -1535,14 +1543,19 @@
         
         if isinstance(speech, np.ndarray):
             speech = torch.tensor(speech)
-        
-        feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
-        feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
+
+        if self.frontend is not None:
+            speech = torch.unsqueeze(speech, axis=0)
+            speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+            feats, feats_lengths = self.frontend(speech, speech_lengths)
+        else:                
+            feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+            feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
         
         feats = to_device(feats, device=self.device)
         feats_lengths = to_device(feats_lengths, device=self.device)
         
-        enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
+        enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths)
         
         nbest_hyps = self.beam_search(enc_out[0])
         

--
Gitblit v1.9.1